diff --git a/.devops/cann.Dockerfile b/.devops/cann.Dockerfile index 9df86d0489bf..36cee7bdb631 100644 --- a/.devops/cann.Dockerfile +++ b/.devops/cann.Dockerfile @@ -13,6 +13,20 @@ ARG APP_REVISION=N/A # BUILD STAGE # Compile all binary files and libraries # ============================================================================== +ARG NODE_VERSION=24 + +FROM docker.io/node:$NODE_VERSION AS web + +ARG APP_VERSION + +WORKDIR /app/tools/ui + +COPY tools/ui/package.json tools/ui/package-lock.json ./ +RUN npm ci + +COPY tools/ui/ ./ +RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build + FROM ${CANN_BASE_IMAGE} AS build # -- Install build dependencies -- @@ -26,6 +40,8 @@ WORKDIR /app # -- Copy project files -- COPY . . +COPY --from=web /app/tools/ui/dist tools/ui/dist + # -- Set CANN environment variables (required for compilation) -- # Using ENV instead of `source` allows environment variables to persist across the entire image layer ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest @@ -129,7 +145,7 @@ ENTRYPOINT ["/app/tools.sh"] # ============================================================================== FROM base AS light -COPY --from=build /app/full/llama-cli /app/full/llama-completion /app +COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app ENTRYPOINT [ "/app/llama-cli" ] @@ -140,7 +156,7 @@ FROM base AS server ENV LLAMA_ARG_HOST=0.0.0.0 -COPY --from=build /app/full/llama-server /app +COPY --from=build /app/full/llama /app/full/llama-server /app HEALTHCHECK --interval=5m CMD [ "curl", "-f", "http://localhost:8080/health" ] diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile index a6dd6a516b6d..cb92343d6c07 100644 --- a/.devops/cpu.Dockerfile +++ b/.devops/cpu.Dockerfile @@ -3,7 +3,21 @@ ARG BUILD_DATE=N/A ARG APP_VERSION=N/A ARG APP_REVISION=N/A -FROM ubuntu:$UBUNTU_VERSION AS build +ARG NODE_VERSION=24 + +FROM docker.io/node:$NODE_VERSION AS web + +ARG APP_VERSION + +WORKDIR /app/tools/ui + +COPY tools/ui/package.json tools/ui/package-lock.json ./ +RUN npm ci + +COPY tools/ui/ ./ +RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build + +FROM docker.io/ubuntu:$UBUNTU_VERSION AS build ARG TARGETARCH @@ -16,6 +30,8 @@ WORKDIR /app COPY . . +COPY --from=web /app/tools/ui/dist tools/ui/dist + RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \ cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \ else \ @@ -37,7 +53,7 @@ RUN mkdir -p /app/full \ && cp .devops/tools.sh /app/full/tools.sh ## Base image -FROM ubuntu:$UBUNTU_VERSION AS base +FROM docker.io/ubuntu:$UBUNTU_VERSION AS base ARG BUILD_DATE=N/A ARG APP_VERSION=N/A @@ -88,7 +104,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app/full/llama-completion /app +COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app @@ -99,7 +115,7 @@ FROM base AS server ENV LLAMA_ARG_HOST=0.0.0.0 -COPY --from=build /app/full/llama-server /app +COPY --from=build /app/full/llama /app/full/llama-server /app WORKDIR /app diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile index 825df2a582cc..c9a498d538b3 100644 --- a/.devops/cuda.Dockerfile +++ b/.devops/cuda.Dockerfile @@ -1,29 +1,47 @@ ARG UBUNTU_VERSION=24.04 # This needs to generally match the container host's environment. ARG CUDA_VERSION=12.8.1 +ARG GCC_VERSION=14 # Target the CUDA build image -ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} +ARG BASE_CUDA_DEV_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} -ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} +ARG BASE_CUDA_RUN_CONTAINER=docker.io/nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} ARG BUILD_DATE=N/A ARG APP_VERSION=N/A ARG APP_REVISION=N/A +ARG NODE_VERSION=24 + +FROM docker.io/node:$NODE_VERSION AS web + +ARG APP_VERSION + +WORKDIR /app/tools/ui + +COPY tools/ui/package.json tools/ui/package-lock.json ./ +RUN npm ci + +COPY tools/ui/ ./ +RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build + FROM ${BASE_CUDA_DEV_CONTAINER} AS build +ARG GCC_VERSION # CUDA architecture to build for (defaults to all supported archs) ARG CUDA_DOCKER_ARCH=default RUN apt-get update && \ - apt-get install -y gcc-14 g++-14 build-essential cmake python3 python3-pip git libssl-dev libgomp1 + apt-get install -y gcc-${GCC_VERSION} g++-${GCC_VERSION} build-essential cmake python3 python3-pip git libssl-dev libgomp1 -ENV CC=gcc-14 CXX=g++-14 CUDAHOSTCXX=g++-14 +ENV CC=gcc-${GCC_VERSION} CXX=g++-${GCC_VERSION} CUDAHOSTCXX=g++-${GCC_VERSION} WORKDIR /app COPY . . +COPY --from=web /app/tools/ui/dist tools/ui/dist + RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ fi && \ @@ -95,7 +113,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app/full/llama-completion /app +COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app @@ -106,7 +124,7 @@ FROM base AS server ENV LLAMA_ARG_HOST=0.0.0.0 -COPY --from=build /app/full/llama-server /app +COPY --from=build /app/full/llama /app/full/llama-server /app WORKDIR /app diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile index 93fd4fa5a30c..b4bcd94b9264 100644 --- a/.devops/intel.Dockerfile +++ b/.devops/intel.Dockerfile @@ -5,9 +5,23 @@ ARG APP_REVISION=N/A ## Build Image -FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS build +ARG NODE_VERSION=24 -ARG GGML_SYCL_F16=OFF +FROM docker.io/node:$NODE_VERSION AS web + +ARG APP_VERSION + +WORKDIR /app/tools/ui + +COPY tools/ui/package.json tools/ui/package-lock.json ./ +RUN npm ci + +COPY tools/ui/ ./ +RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build + +FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS build + +ARG GGML_SYCL_F16=ON ARG LEVEL_ZERO_VERSION=1.28.2 ARG LEVEL_ZERO_UBUNTU_VERSION=u24.04 RUN apt-get update && \ @@ -22,9 +36,12 @@ WORKDIR /app COPY . . +COPY --from=web /app/tools/ui/dist tools/ui/dist + RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ echo "GGML_SYCL_F16 is set" \ - && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ + && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON" \ + && export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"; \ fi && \ echo "Building with dynamic libs" && \ cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_BUILD_TESTS=OFF ${OPT_SYCL_F16} && \ @@ -42,7 +59,7 @@ RUN mkdir -p /app/full \ && cp requirements.txt /app/full \ && cp .devops/tools.sh /app/full/tools.sh -FROM intel/deep-learning-essentials:$ONEAPI_VERSION AS base +FROM docker.io/intel/deep-learning-essentials:$ONEAPI_VERSION AS base ARG BUILD_DATE=N/A ARG APP_VERSION=N/A @@ -124,7 +141,7 @@ ENTRYPOINT ["/app/tools.sh"] FROM base AS light COPY --from=build /app/lib/ /app -COPY --from=build /app/full/llama-cli /app/full/llama-completion /app +COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app @@ -136,7 +153,7 @@ FROM base AS server ENV LLAMA_ARG_HOST=0.0.0.0 COPY --from=build /app/lib/ /app -COPY --from=build /app/full/llama-server /app +COPY --from=build /app/full/llama /app/full/llama-server /app WORKDIR /app diff --git a/.devops/llama-cli-cann.Dockerfile b/.devops/llama-cli-cann.Dockerfile index 447d871ac4fb..096151f1afda 100644 --- a/.devops/llama-cli-cann.Dockerfile +++ b/.devops/llama-cli-cann.Dockerfile @@ -3,7 +3,7 @@ ARG BUILD_DATE=N/A ARG APP_VERSION=N/A ARG APP_REVISION=N/A -FROM ascendai/cann:$ASCEND_VERSION AS build +FROM docker.io/ascendai/cann:$ASCEND_VERSION AS build WORKDIR /app @@ -30,7 +30,7 @@ RUN echo "Building with static libs" && \ cmake --build build --config Release --target llama-completion # TODO: use image with NNRT -FROM ascendai/cann:$ASCEND_VERSION AS runtime +FROM docker.io/ascendai/cann:$ASCEND_VERSION AS runtime ARG BUILD_DATE=N/A ARG APP_VERSION=N/A diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile index ddc29b278627..d30a70bb364c 100644 --- a/.devops/musa.Dockerfile +++ b/.devops/musa.Dockerfile @@ -2,14 +2,28 @@ ARG UBUNTU_VERSION=22.04 # This needs to generally match the container host's environment. ARG MUSA_VERSION=rc4.3.0 # Target the MUSA build image -ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 +ARG BASE_MUSA_DEV_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64 -ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 +ARG BASE_MUSA_RUN_CONTAINER=docker.io/mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64 ARG BUILD_DATE=N/A ARG APP_VERSION=N/A ARG APP_REVISION=N/A +ARG NODE_VERSION=24 + +FROM docker.io/node:$NODE_VERSION AS web + +ARG APP_VERSION + +WORKDIR /app/tools/ui + +COPY tools/ui/package.json tools/ui/package-lock.json ./ +RUN npm ci + +COPY tools/ui/ ./ +RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build + FROM ${BASE_MUSA_DEV_CONTAINER} AS build # MUSA architecture to build for (defaults to all supported archs) @@ -29,6 +43,8 @@ WORKDIR /app COPY . . +COPY --from=web /app/tools/ui/dist tools/ui/dist + RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ fi && \ @@ -99,7 +115,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app/full/llama-completion /app +COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app @@ -110,7 +126,7 @@ FROM base AS server ENV LLAMA_ARG_HOST=0.0.0.0 -COPY --from=build /app/full/llama-server /app +COPY --from=build /app/full/llama /app/full/llama-server /app WORKDIR /app diff --git a/.devops/openvino.Dockerfile b/.devops/openvino.Dockerfile index ab14288ce171..9b2784b664e9 100644 --- a/.devops/openvino.Dockerfile +++ b/.devops/openvino.Dockerfile @@ -1,17 +1,17 @@ -ARG OPENVINO_VERSION_MAJOR=2026.0 -ARG OPENVINO_VERSION_FULL=2026.0.0.20965.c6d6a13a886 +ARG OPENVINO_VERSION_MAJOR=2026.2.1 +ARG OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3 ARG UBUNTU_VERSION=24.04 # Intel GPU driver versions. https://github.com/intel/compute-runtime/releases -ARG IGC_VERSION=v2.30.1 -ARG IGC_VERSION_FULL=2_2.30.1+20950 -ARG COMPUTE_RUNTIME_VERSION=26.09.37435.1 -ARG COMPUTE_RUNTIME_VERSION_FULL=26.09.37435.1-0 -ARG IGDGMM_VERSION=22.9.0 +ARG IGC_VERSION=v2.36.3 +ARG IGC_VERSION_FULL=2_2.36.3+21719 +ARG COMPUTE_RUNTIME_VERSION=26.22.38646.4 +ARG COMPUTE_RUNTIME_VERSION_FULL=26.22.38646.4-0 +ARG IGDGMM_VERSION=22.10.0 # Intel NPU driver versions. https://github.com/intel/linux-npu-driver/releases -ARG NPU_DRIVER_VERSION=v1.32.0 -ARG NPU_DRIVER_FULL=v1.32.0.20260402-23905121947 +ARG NPU_DRIVER_VERSION=v1.33.0 +ARG NPU_DRIVER_FULL=v1.33.0.20260529-26625960453 ARG LIBZE1_VERSION=1.27.0-1~24.04~ppa2 # Optional proxy build arguments @@ -22,8 +22,22 @@ ARG BUILD_DATE=N/A ARG APP_VERSION=N/A ARG APP_REVISION=N/A +ARG NODE_VERSION=24 + +FROM docker.io/node:$NODE_VERSION AS web + +ARG APP_VERSION + +WORKDIR /app/tools/ui + +COPY tools/ui/package.json tools/ui/package-lock.json ./ +RUN npm ci + +COPY tools/ui/ ./ +RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build + ## Build Image -FROM ubuntu:${UBUNTU_VERSION} AS build +FROM docker.io/ubuntu:${UBUNTU_VERSION} AS build # Pass proxy args to build stage ARG http_proxy @@ -46,13 +60,18 @@ RUN apt-get update && \ intel-opencl-icd && \ rm -rf /var/lib/apt/lists/* -# Install OpenVINO for Ubuntu 24.04 +# OpenVINO toolkit and GPU/NPU drivers are cached via BuildKit cache mounts to avoid re-downloading on rebuilds. +# Install OpenVINO for Ubuntu 24.04. ARG OPENVINO_VERSION_MAJOR ARG OPENVINO_VERSION_FULL -RUN mkdir -p /opt/intel && \ - wget https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \ - tar -xf openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \ - mv openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \ +RUN --mount=type=cache,target=/var/cache/openvino,sharing=locked \ + mkdir -p /opt/intel && \ + TGZ=/var/cache/openvino/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz && \ + if [ ! -f "$TGZ" ]; then \ + wget -O "$TGZ" https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz; \ + fi && \ + tar -xf "$TGZ" -C /opt/intel/ && \ + mv /opt/intel/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64 /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \ cd /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} && \ echo "Y" | ./install_dependencies/install_openvino_dependencies.sh && \ cd - && \ @@ -64,18 +83,20 @@ WORKDIR /app COPY . . +COPY --from=web /app/tools/ui/dist tools/ui/dist + # Build Stage RUN bash -c "source ${OpenVINO_DIR}/setupvars.sh && \ cmake -B build/ReleaseOV -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ + -DLLAMA_BUILD_TESTS=OFF \ -DGGML_OPENVINO=ON && \ - cmake --build build/ReleaseOV -j$(nproc)" + cmake --build build/ReleaseOV --parallel " -# Copy all necessary libraries +# Copy all necessary libraries (build outputs + OpenVINO runtime libs) RUN mkdir -p /app/lib && \ - find build/ReleaseOV -name '*.so*' -exec cp {} /app/lib \; && \ - find ${OpenVINO_DIR}/runtime/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; 2>/dev/null || \ - find ${OpenVINO_DIR}/lib/intel64 -name '*.so*' -exec cp -P {} /app/lib \; + find build/ReleaseOV -name '*.so*' -exec cp -P {} /app/lib \; && \ + find "${OpenVINO_DIR}/runtime/lib/intel64" -name '*.so*' -exec cp -P {} /app/lib \; # Create runtime directories and copy binaries RUN mkdir -p /app/full \ @@ -88,7 +109,7 @@ RUN mkdir -p /app/full \ && cp .devops/tools.sh /app/full/tools.sh ## Base Runtime Image -FROM ubuntu:${UBUNTU_VERSION} AS base +FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base # Pass proxy args to runtime stage ARG http_proxy @@ -120,33 +141,41 @@ ARG IGC_VERSION_FULL ARG COMPUTE_RUNTIME_VERSION ARG COMPUTE_RUNTIME_VERSION_FULL ARG IGDGMM_VERSION -RUN mkdir /tmp/neo/ && cd /tmp/neo/ \ - && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \ - && wget https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \ - && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \ - && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \ - && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \ - && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \ - && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \ - && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1-dbgsym_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.ddeb \ - && wget https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \ - && dpkg --install *.deb \ - && rm -rf /tmp/neo/ +RUN --mount=type=cache,target=/var/cache/intel-gpu,sharing=locked \ + set -eux; \ + cd /var/cache/intel-gpu; \ + for url in \ + https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-core-${IGC_VERSION_FULL}_amd64.deb \ + https://github.com/intel/intel-graphics-compiler/releases/download/${IGC_VERSION}/intel-igc-opencl-${IGC_VERSION_FULL}_amd64.deb \ + https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-ocloc_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \ + https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/intel-opencl-icd_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb \ + https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libigdgmm12_${IGDGMM_VERSION}_amd64.deb \ + https://github.com/intel/compute-runtime/releases/download/${COMPUTE_RUNTIME_VERSION}/libze-intel-gpu1_${COMPUTE_RUNTIME_VERSION_FULL}_amd64.deb ; do \ + f=$(basename "$url"); \ + [ -f "$f" ] || wget -q -O "$f" "$url"; \ + done; \ + apt-get update; \ + apt-get install -y --no-install-recommends ./*.deb; \ + rm -rf /var/lib/apt/lists/* # Install NPU drivers ARG NPU_DRIVER_VERSION ARG NPU_DRIVER_FULL ARG LIBZE1_VERSION -RUN mkdir /tmp/npu/ && cd /tmp/npu/ \ - && wget https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \ - && tar -xf linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz \ - && dpkg --install *.deb \ - && rm -rf /tmp/npu/ - -RUN cd /tmp \ - && wget https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb \ - && dpkg --install libze1_${LIBZE1_VERSION}_amd64.deb \ - && rm libze1_${LIBZE1_VERSION}_amd64.deb +RUN --mount=type=cache,target=/var/cache/intel-npu,sharing=locked \ + set -eux; \ + TGZ=/var/cache/intel-npu/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz; \ + if [ ! -f "$TGZ" ]; then \ + wget -q -O "$TGZ" https://github.com/intel/linux-npu-driver/releases/download/${NPU_DRIVER_VERSION}/linux-npu-driver-${NPU_DRIVER_FULL}-ubuntu2404.tar.gz; \ + fi; \ + DEB=/var/cache/intel-npu/libze1_${LIBZE1_VERSION}_amd64.deb; \ + if [ ! -f "$DEB" ]; then \ + wget -q -O "$DEB" https://snapshot.ppa.launchpadcontent.net/kobuk-team/intel-graphics/ubuntu/20260324T100000Z/pool/main/l/level-zero-loader/libze1_${LIBZE1_VERSION}_amd64.deb; \ + fi; \ + mkdir /tmp/npu/ && cd /tmp/npu/ && tar -xf "$TGZ" && cp "$DEB" .; \ + apt-get update; \ + apt-get install -y --no-install-recommends ./*.deb; \ + rm -rf /tmp/npu/ /var/lib/apt/lists/* COPY --from=build /app/lib/ /app/ @@ -166,22 +195,26 @@ RUN apt-get update && \ python3 \ python3-venv \ python3-pip && \ - python3 -m venv /ov-venv && \ - /ov-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \ - /ov-venv/bin/pip install --no-cache-dir -r requirements.txt && \ + python3 -m venv /openvino-venv && \ + /openvino-venv/bin/pip install --no-cache-dir --upgrade pip setuptools wheel && \ + /openvino-venv/bin/pip install --no-cache-dir -r requirements.txt && \ apt-get autoremove -y && \ apt-get clean && \ rm -rf /tmp/* /var/tmp/* && \ find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete && \ find /var/cache -type f -delete -ENTRYPOINT ["/bin/bash", "-c", "source /ov-venv/bin/activate && exec /app/tools.sh \"$@\"", "--"] +# Activate the venv +ENV VIRTUAL_ENV=/openvino-venv \ + PATH=/openvino-venv/bin:$PATH + +ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app/ +COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app/ WORKDIR /app @@ -192,7 +225,7 @@ FROM base AS server ENV LLAMA_ARG_HOST=0.0.0.0 -COPY --from=build /app/full/llama-server /app/ +COPY --from=build /app/full/llama /app/full/llama-server /app/ WORKDIR /app diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile index 2209ab661f24..a8bc4e1fcd64 100644 --- a/.devops/rocm.Dockerfile +++ b/.devops/rocm.Dockerfile @@ -5,12 +5,26 @@ ARG ROCM_VERSION=7.2.1 ARG AMDGPU_VERSION=7.2.1 # Target the ROCm build image -ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete +ARG BASE_ROCM_DEV_CONTAINER=docker.io/rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete ARG BUILD_DATE=N/A ARG APP_VERSION=N/A ARG APP_REVISION=N/A +ARG NODE_VERSION=24 + +FROM docker.io/node:$NODE_VERSION AS web + +ARG APP_VERSION + +WORKDIR /app/tools/ui + +COPY tools/ui/package.json tools/ui/package-lock.json ./ +RUN npm ci + +COPY tools/ui/ ./ +RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build + ### Build image FROM ${BASE_ROCM_DEV_CONTAINER} AS build @@ -38,6 +52,8 @@ WORKDIR /app COPY . . +COPY --from=web /app/tools/ui/dist tools/ui/dist + RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ cmake -S . -B build \ -DGGML_HIP=ON \ @@ -111,7 +127,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app/full/llama-completion /app +COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app @@ -122,7 +138,7 @@ FROM base AS server ENV LLAMA_ARG_HOST=0.0.0.0 -COPY --from=build /app/full/llama-server /app +COPY --from=build /app/full/llama /app/full/llama-server /app WORKDIR /app diff --git a/.devops/s390x.Dockerfile b/.devops/s390x.Dockerfile index 31c2fa902d4a..94a715ff2d48 100644 --- a/.devops/s390x.Dockerfile +++ b/.devops/s390x.Dockerfile @@ -5,7 +5,7 @@ ARG APP_VERSION=N/A ARG APP_REVISION=N/A ### Build Llama.cpp stage -FROM gcc:${GCC_VERSION} AS build +FROM docker.io/gcc:${GCC_VERSION} AS build RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt/lists,sharing=locked \ @@ -55,7 +55,7 @@ COPY --from=build /opt/llama.cpp/conversion /llama.cpp/conversion ### Base image -FROM ubuntu:${UBUNTU_VERSION} AS base +FROM docker.io/ubuntu:${UBUNTU_VERSION} AS base ARG BUILD_DATE=N/A ARG APP_VERSION=N/A @@ -124,7 +124,7 @@ WORKDIR /llama.cpp/bin # Copy llama.cpp binaries and libraries COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin -COPY --from=collector /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin +COPY --from=collector /llama.cpp/bin/llama /llama.cpp/bin/llama-cli /llama.cpp/bin/llama-completion /llama.cpp/bin ENTRYPOINT [ "/llama.cpp/bin/llama-cli" ] @@ -138,7 +138,7 @@ WORKDIR /llama.cpp/bin # Copy llama.cpp binaries and libraries COPY --from=collector /llama.cpp/bin/*.so /llama.cpp/bin -COPY --from=collector /llama.cpp/bin/llama-server /llama.cpp/bin +COPY --from=collector /llama.cpp/bin/llama /llama.cpp/bin/llama-server /llama.cpp/bin EXPOSE 8080 diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile index f26c7c45b8a2..d3599ffb82c0 100644 --- a/.devops/vulkan.Dockerfile +++ b/.devops/vulkan.Dockerfile @@ -3,7 +3,21 @@ ARG BUILD_DATE=N/A ARG APP_VERSION=N/A ARG APP_REVISION=N/A -FROM ubuntu:$UBUNTU_VERSION AS build +ARG NODE_VERSION=24 + +FROM docker.io/node:$NODE_VERSION AS web + +ARG APP_VERSION + +WORKDIR /app/tools/ui + +COPY tools/ui/package.json tools/ui/package-lock.json ./ +RUN npm ci + +COPY tools/ui/ ./ +RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build + +FROM docker.io/ubuntu:$UBUNTU_VERSION AS build # Install build tools RUN apt update && apt install -y git build-essential cmake wget xz-utils @@ -17,6 +31,8 @@ WORKDIR /app COPY . . +COPY --from=web /app/tools/ui/dist tools/ui/dist + RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=ON -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON && \ cmake --build build --config Release -j$(nproc) @@ -33,7 +49,7 @@ RUN mkdir -p /app/full \ && cp .devops/tools.sh /app/full/tools.sh ## Base image -FROM ubuntu:$UBUNTU_VERSION AS base +FROM docker.io/ubuntu:$UBUNTU_VERSION AS base ARG BUILD_DATE=N/A ARG APP_VERSION=N/A @@ -91,7 +107,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app/full/llama-completion /app +COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app @@ -102,7 +118,7 @@ FROM base AS server ENV LLAMA_ARG_HOST=0.0.0.0 -COPY --from=build /app/full/llama-server /app +COPY --from=build /app/full/llama /app/full/llama-server /app WORKDIR /app diff --git a/.devops/zendnn.Dockerfile b/.devops/zendnn.Dockerfile index c25d7e4d3da7..8a50b3ef6a13 100644 --- a/.devops/zendnn.Dockerfile +++ b/.devops/zendnn.Dockerfile @@ -3,7 +3,21 @@ ARG BUILD_DATE=N/A ARG APP_VERSION=N/A ARG APP_REVISION=N/A -FROM ubuntu:$UBUNTU_VERSION AS build +ARG NODE_VERSION=24 + +FROM docker.io/node:$NODE_VERSION AS web + +ARG APP_VERSION + +WORKDIR /app/tools/ui + +COPY tools/ui/package.json tools/ui/package-lock.json ./ +RUN npm ci + +COPY tools/ui/ ./ +RUN LLAMA_BUILD_NUMBER="$APP_VERSION" npm run build + +FROM docker.io/ubuntu:$UBUNTU_VERSION AS build RUN apt-get update && \ apt-get install -y gcc-13 g++-13 build-essential git cmake libssl-dev libomp-dev libnuma-dev python3 ca-certificates @@ -14,6 +28,8 @@ WORKDIR /app COPY . . +COPY --from=web /app/tools/ui/dist tools/ui/dist + RUN cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_TESTS=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_ZENDNN=ON && \ cmake --build build -j $(nproc) @@ -30,7 +46,7 @@ RUN mkdir -p /app/full \ && cp .devops/tools.sh /app/full/tools.sh ## Base image -FROM ubuntu:$UBUNTU_VERSION AS base +FROM docker.io/ubuntu:$UBUNTU_VERSION AS base ARG BUILD_DATE=N/A ARG APP_VERSION=N/A @@ -81,7 +97,7 @@ ENTRYPOINT ["/app/tools.sh"] ### Light, CLI only FROM base AS light -COPY --from=build /app/full/llama-cli /app/full/llama-completion /app +COPY --from=build /app/full/llama /app/full/llama-cli /app/full/llama-completion /app WORKDIR /app @@ -92,7 +108,7 @@ FROM base AS server ENV LLAMA_ARG_HOST=0.0.0.0 -COPY --from=build /app/full/llama-server /app +COPY --from=build /app/full/llama /app/full/llama-server /app WORKDIR /app diff --git a/.dockerignore b/.dockerignore index 064b7c7be86d..0b81e83bf5d4 100644 --- a/.dockerignore +++ b/.dockerignore @@ -10,6 +10,8 @@ build*/ +tools/ui/node_modules/ + models/* /llama-cli diff --git a/.github/actions/windows-setup-openvino/action.yml b/.github/actions/windows-setup-openvino/action.yml new file mode 100644 index 000000000000..f983df56025b --- /dev/null +++ b/.github/actions/windows-setup-openvino/action.yml @@ -0,0 +1,24 @@ +name: "Windows - Setup OpenVINO Toolkit" +description: "Setup OpenVINO Toolkit for Windows" +inputs: + path: + description: "Installation path" + required: true + version_major: + description: "OpenVINO major version (e.g., 2026.2)" + required: true + version_full: + description: "OpenVINO full version" + required: true + +runs: + using: "composite" + steps: + - name: Download and extract OpenVINO Runtime + shell: powershell + run: | + $url = "https://storage.openvinotoolkit.org/repositories/openvino/packages/${{ inputs.version_major }}/windows/openvino_toolkit_windows_${{ inputs.version_full }}_x86_64.zip" + $out = "openvino.zip" + Invoke-WebRequest -Uri $url -OutFile $out + Expand-Archive -Path $out -DestinationPath ${{ inputs.path }} -Force + Remove-Item $out diff --git a/.github/labeler.yml b/.github/labeler.yml index 60aa51d2cc23..20e19c35234e 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -12,7 +12,7 @@ SYCL: - ggml/src/ggml-sycl/** - docs/backend/SYCL.md - examples/sycl/** -Nvidia GPU: +CUDA: - changed-files: - any-glob-to-any-file: - ggml/include/ggml-cuda.h @@ -35,8 +35,20 @@ AMD ZenDNN: documentation: - changed-files: - any-glob-to-any-file: + - "**/*.md" - docs/** - media/** +examples: + - all: + - changed-files: + - any-glob-to-any-file: + - app/** + - examples/** + - tools/** + - all-globs-to-all-files: + - '!tools/server/**' + - '!tools/mtmd/**' + - '!tools/ui/**' testing: - changed-files: - any-glob-to-any-file: @@ -47,28 +59,12 @@ build: - cmake/** - CMakeLists.txt - CMakePresets.json -examples: - - changed-files: - - any-glob-to-any-file: - - examples/** - - tools/** devops: - changed-files: - any-glob-to-any-file: - .devops/** - .github/** - ci/** -python: - - changed-files: - - any-glob-to-any-file: - - "**/*.py" - - requirements/** - - gguf-py/** - - .flake8 -script: - - changed-files: - - any-glob-to-any-file: - - scripts/** android: - changed-files: - any-glob-to-any-file: @@ -81,9 +77,20 @@ server: - changed-files: - any-glob-to-any-file: - tools/server/** - - - +mtmd: + - changed-files: + - any-glob-to-any-file: + - tools/mtmd/** +conversion: + - changed-files: + - any-glob-to-any-file: + - conversion/** + - convert_*.py + - gguf-py/** +vendor: + - changed-files: + - any-glob-to-any-file: + - vendor/** ggml: - changed-files: - any-glob-to-any-file: diff --git a/.github/workflows/build-cache.yml b/.github/workflows/build-cache.yml index 53d65f3768b4..327f71978bf1 100644 --- a/.github/workflows/build-cache.yml +++ b/.github/workflows/build-cache.yml @@ -68,8 +68,8 @@ jobs: env: # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile - OPENVINO_VERSION_MAJOR: "2026.0" - OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886" + OPENVINO_VERSION_MAJOR: "2026.2.1" + OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3" steps: - name: Clone @@ -91,6 +91,34 @@ jobs: version_major: ${{ env.OPENVINO_VERSION_MAJOR }} version_full: ${{ env.OPENVINO_VERSION_FULL }} + windows-2022-openvino-cache: + runs-on: windows-2022 + + env: + # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile + OPENVINO_VERSION_MAJOR: "2026.2.1" + OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3" + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Setup Cache + uses: actions/cache@v5 + id: cache-openvino + with: + path: ./openvino_toolkit + key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }} + + - name: Setup OpenVINO Toolkit + if: steps.cache-openvino.outputs.cache-hit != 'true' + uses: ./.github/actions/windows-setup-openvino + with: + path: ./openvino_toolkit + version_major: ${{ env.OPENVINO_VERSION_MAJOR }} + version_full: ${{ env.OPENVINO_VERSION_FULL }} + windows-2022-rocm-cache: runs-on: windows-2022 diff --git a/.github/workflows/build-openvino.yml b/.github/workflows/build-openvino.yml index ddcbc6697455..938cde3f20ff 100644 --- a/.github/workflows/build-openvino.yml +++ b/.github/workflows/build-openvino.yml @@ -37,14 +37,10 @@ jobs: ubuntu-24-openvino: runs-on: [self-hosted, Linux, Intel, OpenVINO] - concurrency: - group: openvino-gpu-${{ github.head_ref || github.ref }} - cancel-in-progress: false - env: # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile - OPENVINO_VERSION_MAJOR: "2026.0" - OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886" + OPENVINO_VERSION_MAJOR: "2026.2.1" + OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3" steps: - name: Clone @@ -78,7 +74,7 @@ jobs: cmake -B build/ReleaseOV -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ -DGGML_OPENVINO=ON - time cmake --build build/ReleaseOV --config Release -j $(nproc) + time cmake --build build/ReleaseOV --config Release --parallel - name: Test (CPU) id: cmake_test_cpu @@ -93,4 +89,81 @@ jobs: run: | cd ${{ github.workspace }} export GGML_OPENVINO_DEVICE=GPU - ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 2000 + ctest --test-dir build/ReleaseOV -L main -E "test-llama-archs" --verbose --timeout 3000 + + openvino-windows-2022: + runs-on: windows-2022 + + env: + # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile + OPENVINO_VERSION_MAJOR: "2026.2.1" + OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3" + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: openvino-windows-2022 + variant: ccache + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Setup Cache + uses: actions/cache@v5 + id: cache-openvino + with: + path: ./openvino_toolkit + key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }} + + - name: Setup OpenVINO Toolkit + if: steps.cache-openvino.outputs.cache-hit != 'true' + uses: ./.github/actions/windows-setup-openvino + with: + path: ./openvino_toolkit + version_major: ${{ env.OPENVINO_VERSION_MAJOR }} + version_full: ${{ env.OPENVINO_VERSION_FULL }} + + - name: Install OpenCL using vcpkg + shell: powershell + run: | + git clone https://github.com/microsoft/vcpkg C:\vcpkg + C:\vcpkg\bootstrap-vcpkg.bat + C:\vcpkg\vcpkg install opencl + + - name: Build + id: cmake_build + shell: cmd + run: | + REM Find extracted OpenVINO folder dynamically + for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i + + if not exist "%OPENVINO_ROOT%\runtime\cmake\OpenVINOConfig.cmake" ( + echo ERROR: OpenVINOConfig.cmake not found + exit /b 1 + ) + + call "%OPENVINO_ROOT%\setupvars.bat" + + cmake -B build\ReleaseOV -G "Visual Studio 17 2022" ^ + -A x64 ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DGGML_OPENVINO=ON ^ + -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake + + cmake --build build\ReleaseOV --config Release -- /m + + - name: Test (CPU) + id: cmake_test_cpu + shell: cmd + # TODO: fix and re-enable the `test-llama-archs` test below + run: | + REM Find extracted OpenVINO folder dynamically + for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i + call "%OPENVINO_ROOT%\setupvars.bat" + + cd build + ctest --test-dir ReleaseOV -L main -E "test-llama-archs" -C Release --verbose --timeout 3000 diff --git a/.github/workflows/build-self-hosted.yml b/.github/workflows/build-self-hosted.yml index 436100c8a4cd..1a71ed827729 100644 --- a/.github/workflows/build-self-hosted.yml +++ b/.github/workflows/build-self-hosted.yml @@ -264,14 +264,10 @@ jobs: gpu-openvino-low-perf: runs-on: [self-hosted, Linux, Intel, OpenVINO] - concurrency: - group: openvino-gpu-${{ github.head_ref || github.ref }} - cancel-in-progress: false - env: # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile - OPENVINO_VERSION_MAJOR: "2026.0" - OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886" + OPENVINO_VERSION_MAJOR: "2026.2.1" + OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3" steps: - name: Clone diff --git a/.github/workflows/build-sycl.yml b/.github/workflows/build-sycl.yml index ef377c8186f6..deb0e5479a57 100644 --- a/.github/workflows/build-sycl.yml +++ b/.github/workflows/build-sycl.yml @@ -34,129 +34,108 @@ env: LLAMA_ARG_LOG_TIMESTAMPS: 1 jobs: + ubuntu-24-sycl: + strategy: + matrix: + build: [fp32, fp16] + include: + - build: fp32 + fp16: OFF + - build: fp16 + fp16: ON -# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705) -# in order to enable it again, we have to provision dedicated runners to run it -# ubuntu-24-sycl: -# strategy: -# matrix: -# build: [fp32] -# include: -# - build: fp32 -# fp16: OFF -# -# runs-on: ubuntu-24.04 -# -# env: -# ONEAPI_ROOT: /opt/intel/oneapi/ -# ONEAPI_INSTALLER_VERSION: "2025.3.3" -# LEVEL_ZERO_VERSION: "1.28.2" -# LEVEL_ZERO_UBUNTU_VERSION: "u24.04" -# -# continue-on-error: true -# -# steps: -# - uses: actions/checkout@v6 -# -# - name: Use oneAPI Installation Cache -# uses: actions/cache@v5 -# id: cache-sycl -# with: -# path: ${{ env.ONEAPI_ROOT }} -# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} -# -# - name: Download & Install oneAPI -# shell: bash -# if: steps.cache-sycl.outputs.cache-hit != 'true' -# run: | -# cd /tmp -# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh -# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept -# -# - name: Install Level Zero SDK -# shell: bash -# run: | -# cd /tmp -# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb -# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb -# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb -# -# - name: Clone -# id: checkout -# uses: actions/checkout@v6 -# -# - name: ccache -# uses: ggml-org/ccache-action@v1.2.21 -# with: -# key: sycl-ubuntu-24-${{ matrix.build }} -# evict-old-files: 1d -# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} -# -# - name: Build -# id: cmake_build -# run: | -# source /opt/intel/oneapi/setvars.sh -# cmake -B build \ -# -G "Ninja" \ -# -DCMAKE_BUILD_TYPE=Release \ -# -DGGML_SYCL=ON \ -# -DCMAKE_C_COMPILER=icx \ -# -DCMAKE_CXX_COMPILER=icpx \ -# -DLLAMA_OPENSSL=OFF \ -# -DGGML_NATIVE=OFF \ -# -DGGML_SYCL_F16=${{ matrix.fp16 }} -# time cmake --build build --config Release -j $(nproc) - -# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705) -# in order to enable it again, we have to provision dedicated runners to run it -# windows-latest-sycl: -# runs-on: windows-2022 -# -# defaults: -# run: -# shell: bash -# -# env: -# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe -# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel -# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip -# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" -# ONEAPI_INSTALLER_VERSION: "2025.3.3" -# steps: -# - name: Clone -# id: checkout -# uses: actions/checkout@v6 -# -# - name: Use oneAPI Installation Cache -# uses: actions/cache@v5 -# id: cache-sycl -# with: -# path: ${{ env.ONEAPI_ROOT }} -# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} -# -# - name: Download & Install oneAPI -# shell: bash -# if: steps.cache-sycl.outputs.cache-hit != 'true' -# run: | -# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL -# -# - name: Install Level Zero SDK -# shell: pwsh -# run: | -# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip" -# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force -# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append -# -# - name: ccache -# uses: ggml-org/ccache-action@v1.2.21 -# with: -# key: sycl-windows-latest -# variant: ccache -# evict-old-files: 1d -# save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} -# -# # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args -# -# - name: Build -# id: cmake_build -# run: examples/sycl/win-build-sycl.bat + runs-on: ubuntu-24.04 + + env: + ONEAPI_ROOT: /opt/intel/oneapi/ + ONEAPI_INSTALLER_VERSION: "2025.3.3" + LEVEL_ZERO_VERSION: "1.28.2" + LEVEL_ZERO_UBUNTU_VERSION: "u24.04" + + continue-on-error: true + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Download & Install oneAPI + shell: bash + run: | + cd /tmp + wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh + sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept + + - name: Install Level Zero SDK + shell: bash + run: | + cd /tmp + wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb + wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb + sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: sycl-ubuntu-24-${{ matrix.build }} + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + - name: Build + id: cmake_build + run: | + source /opt/intel/oneapi/setvars.sh + cmake -B build \ + -G "Ninja" \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DLLAMA_OPENSSL=OFF \ + -DGGML_NATIVE=OFF \ + -DGGML_SYCL_F16=${{ matrix.fp16 }} + time cmake --build build --config Release -j $(nproc) + + windows-latest-sycl: + runs-on: windows-2022 + + defaults: + run: + shell: bash + + env: + WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe + WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel + LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip + ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" + ONEAPI_INSTALLER_VERSION: "2025.3.3" + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Download & Install oneAPI + shell: bash + run: | + scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + + - name: Install Level Zero SDK + shell: pwsh + run: | + Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip" + Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force + "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: sycl-windows-latest + variant: ccache + evict-old-files: 1d + save: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} + + # TODO: add ssl support ; we will also need to modify win-build-sycl.bat to accept user-specified args + + - name: Build + id: cmake_build + run: examples/sycl/win-build-sycl.bat diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 8195a55ff28a..afe4b7c66410 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -58,6 +58,13 @@ jobs: git tag ${{ steps.srctag.outputs.name }} || exit 0 git push origin ${{ steps.srctag.outputs.name }} || exit 0 + build_ui: + name: Build UI + needs: create_tag + uses: ./.github/workflows/ui-build.yml + with: + hf_ui_version: ${{ needs.create_tag.outputs.source_tag }} + prepare_matrices: name: Prepare Docker matrices runs-on: ubuntu-24.04 @@ -79,7 +86,7 @@ jobs: [ { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04" }, { "tag": "cpu", "dockerfile": ".devops/cpu.Dockerfile", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-arm" }, - { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x" }, + { "tag": "cpu", "dockerfile": ".devops/s390x.Dockerfile", "platforms": "linux/s390x", "full": true, "light": true, "server": true, "free_disk_space": false, "runs_on": "ubuntu-24.04-s390x", "prebuilt_ui": true }, { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, { "tag": "cuda cuda12", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "12.8.1", "platforms": "linux/arm64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04-arm" }, { "tag": "cuda13", "dockerfile": ".devops/cuda.Dockerfile", "cuda_version": "13.3.0", "platforms": "linux/amd64", "full": true, "light": true, "server": true, "free_disk_space": true, "runs_on": "ubuntu-24.04" }, @@ -135,7 +142,7 @@ jobs: push_to_registry: name: Push Docker image to Docker Registry - needs: [prepare_matrices, create_tag] + needs: [prepare_matrices, create_tag, build_ui] runs-on: ${{ matrix.config.runs_on }} strategy: @@ -150,6 +157,13 @@ jobs: fetch-depth: 0 ref: ${{ needs.create_tag.outputs.source_tag }} + - name: Download prebuilt UI + if: ${{ matrix.config.prebuilt_ui == true }} + uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8 + with: + name: ui-build + path: tools/ui/dist + - name: Set up QEMU if: ${{ contains(matrix.config.platforms, 'linux/amd64') }} uses: docker/setup-qemu-action@ce360397dd3f832beb865e1373c09c0e9f86d70a # v4 diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b656900e5f89..616fca3daed1 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -46,11 +46,13 @@ jobs: steps: - id: check + env: + COMMIT_MESSAGE: ${{ github.event.head_commit.message }} run: | if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then echo "should_release=true" >> $GITHUB_OUTPUT elif [[ "${{ github.event_name }}" == "push" && "${{ github.ref }}" == "refs/heads/master" ]]; then - if echo "${{ github.event.head_commit.message }}" | grep -q '\[no release\]'; then + if echo "$COMMIT_MESSAGE" | grep -q '\[no release\]'; then echo "should_release=false" >> $GITHUB_OUTPUT else echo "should_release=true" >> $GITHUB_OUTPUT @@ -59,8 +61,31 @@ jobs: echo "should_release=false" >> $GITHUB_OUTPUT fi + get-version: + runs-on: ubuntu-slim + outputs: + ui_version: ${{ steps.version.outputs.ui_version }} + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + - id: version + run: | + # Resolve UI version: BUILD_NUMBER from cmake/build-info.cmake > git hash + epoch > fallback + version="" + if grep -q "BUILD_NUMBER" cmake/build-info.cmake; then + build_number=$(grep "set(BUILD_NUMBER" cmake/build-info.cmake | grep -oP '\d+') + if [ -n "$build_number" ] && [ "$build_number" -gt 0 ]; then + version="b${build_number}" + fi + fi + if [ -z "$version" ]; then + version=$(git rev-parse --short HEAD)-$(date +%s) + fi + echo "ui_version=${version}" >> $GITHUB_OUTPUT + macos-cpu: - needs: [check-release] + needs: [check-release, get-version] if: ${{ needs.check-release.outputs.should_release == 'true' }} strategy: matrix: @@ -116,6 +141,7 @@ jobs: -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ -DLLAMA_FATAL_WARNINGS=ON \ -DLLAMA_BUILD_BORINGSSL=ON \ + -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \ ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) @@ -141,7 +167,7 @@ jobs: name: llama-bin-macos-${{ matrix.build }}.tar.gz ubuntu-cpu: - needs: [check-release] + needs: [check-release, get-version] if: ${{ needs.check-release.outputs.should_release == 'true' }} strategy: matrix: @@ -201,6 +227,7 @@ jobs: -DGGML_NATIVE=OFF \ -DGGML_CPU_ALL_VARIANTS=ON \ -DLLAMA_FATAL_WARNINGS=ON \ + -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \ ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) @@ -227,7 +254,7 @@ jobs: name: llama-bin-ubuntu-${{ matrix.build }}.tar.gz ubuntu-vulkan: - needs: [check-release] + needs: [check-release, get-version] if: ${{ needs.check-release.outputs.should_release == 'true' }} strategy: @@ -287,6 +314,7 @@ jobs: -DGGML_NATIVE=OFF \ -DGGML_CPU_ALL_VARIANTS=ON \ -DGGML_VULKAN=ON \ + -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \ ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) @@ -312,7 +340,7 @@ jobs: name: llama-bin-ubuntu-vulkan-${{ matrix.build }}.tar.gz android-arm64: - needs: [check-release] + needs: [check-release, get-version] if: ${{ needs.check-release.outputs.should_release == 'true' }} runs-on: ubuntu-latest @@ -379,6 +407,7 @@ jobs: -DLLAMA_FATAL_WARNINGS=ON \ -DGGML_OPENMP=OFF \ -DLLAMA_BUILD_BORINGSSL=ON \ + -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \ ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) @@ -404,7 +433,7 @@ jobs: name: llama-bin-android-arm64.tar.gz ubuntu-24-openvino: - needs: [check-release] + needs: [check-release, get-version] if: ${{ needs.check-release.outputs.should_release == 'true' }} runs-on: ubuntu-24.04 @@ -416,9 +445,9 @@ jobs: openvino_version: ${{ steps.openvino_version.outputs.value }} env: - # Sync versions in build.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile - OPENVINO_VERSION_MAJOR: "2026.0" - OPENVINO_VERSION_FULL: "2026.0.0.20965.c6d6a13a886" + # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile + OPENVINO_VERSION_MAJOR: "2026.2.1" + OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3" steps: - name: Set OpenVINO version output @@ -476,8 +505,12 @@ jobs: source ./openvino_toolkit/setupvars.sh cmake -B build/ReleaseOV -G Ninja \ -DCMAKE_BUILD_TYPE=Release \ - -DGGML_OPENVINO=ON - cmake --build build/ReleaseOV --config Release -j $(nproc) + -DGGML_OPENVINO=ON \ + -DCMAKE_INSTALL_RPATH='$ORIGIN' \ + -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ + -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \ + ${{ env.CMAKE_ARGS }} + cmake --build build/ReleaseOV --config Release --parallel - name: ccache-clear uses: ./.github/actions/ccache-clear @@ -491,8 +524,26 @@ jobs: - name: Pack artifacts id: pack_artifacts run: | - cp LICENSE ./build/ReleaseOV/bin/ - tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/ReleaseOV/bin . + dest=./build/ReleaseOV/bin + OPENVINO_ROOT=./openvino_toolkit + ov_lib="$OPENVINO_ROOT/runtime/lib/intel64" + + # Bundle OpenVINO runtime libs + TBB. Binaries built with RPATH=$ORIGIN + # load these siblings without setupvars.sh / LD_LIBRARY_PATH. + cp -P "$ov_lib"/libopenvino.so* \ + "$ov_lib"/libopenvino_c.so* \ + "$ov_lib"/libopenvino_*_plugin.so \ + "$ov_lib"/libopenvino_intel_npu_compiler*.so \ + "$OPENVINO_ROOT"/runtime/3rdparty/tbb/lib/*.so* \ + "$dest" + cp -P /usr/lib/x86_64-linux-gnu/libOpenCL.so.1* "$dest" 2>/dev/null || true + cp "$ov_lib"/cache.json "$dest" 2>/dev/null || true + + # OpenVINO licensing + cp -r "$OPENVINO_ROOT"/docs/licensing "$dest"/openvino-licensing + + cp LICENSE "$dest" + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C "$dest" . - name: Upload artifacts uses: actions/upload-artifact@v6 @@ -500,6 +551,135 @@ jobs: path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz name: llama-bin-ubuntu-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.tar.gz + windows-openvino: + needs: [check-release] + if: ${{ needs.check-release.outputs.should_release == 'true' }} + + runs-on: windows-2022 + + outputs: + openvino_version: ${{ steps.openvino_version.outputs.value }} + + env: + # Sync versions in build-openvino.yml, build-self-hosted.yml, release.yml, build-cache.yml, .devops/openvino.Dockerfile + OPENVINO_VERSION_MAJOR: "2026.2.1" + OPENVINO_VERSION_FULL: "2026.2.1.21919.ede283a88e3" + + steps: + - name: Set OpenVINO version output + id: openvino_version + shell: bash + run: echo "value=${{ env.OPENVINO_VERSION_MAJOR }}" >> $GITHUB_OUTPUT + + - name: Clone + id: checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Setup Node.js + uses: actions/setup-node@v6 + with: + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: release-windows-2022-openvino + variant: ccache + evict-old-files: 1d + + - name: Setup Cache + uses: actions/cache@v5 + id: cache-openvino + with: + path: ./openvino_toolkit + key: cache-gha-openvino-toolkit-v${{ env.OPENVINO_VERSION_FULL }}-${{ runner.os }} + + - name: Setup OpenVINO Toolkit + if: steps.cache-openvino.outputs.cache-hit != 'true' + uses: ./.github/actions/windows-setup-openvino + with: + path: ./openvino_toolkit + version_major: ${{ env.OPENVINO_VERSION_MAJOR }} + version_full: ${{ env.OPENVINO_VERSION_FULL }} + + - name: Install OpenCL using vcpkg + shell: powershell + run: | + git clone https://github.com/microsoft/vcpkg C:\vcpkg + C:\vcpkg\bootstrap-vcpkg.bat + C:\vcpkg\vcpkg install opencl + + - name: Build + id: cmake_build + shell: cmd + run: | + REM Find extracted OpenVINO folder dynamically + for /d %%i in (openvino_toolkit\*) do set OPENVINO_ROOT=%%i + + if not exist "%OPENVINO_ROOT%\runtime\cmake\OpenVINOConfig.cmake" ( + echo ERROR: OpenVINOConfig.cmake not found + exit /b 1 + ) + + call "%OPENVINO_ROOT%\setupvars.bat" + + cmake -B build\ReleaseOV -G "Visual Studio 17 2022" ^ + -A x64 ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DGGML_OPENVINO=ON ^ + -DLLAMA_BUILD_BORINGSSL=ON ^ + -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake ^ + ${{ env.CMAKE_ARGS }} + + cmake --build build\ReleaseOV --config Release -- /m + + - name: ccache-clear + uses: ./.github/actions/ccache-clear + with: + key: release-windows-2022-openvino + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + shell: powershell + run: | + # Locate the extracted OpenVINO toolkit root (same pattern as the Build step). + $OPENVINO_ROOT = (Get-ChildItem -Directory openvino_toolkit | Select-Object -First 1).FullName + if (-not $OPENVINO_ROOT) { + Write-Error "OpenVINO toolkit folder not found under .\openvino_toolkit" + exit 1 + } + + $dest = ".\build\ReleaseOV\bin\Release" + + $ovBin = Join-Path $OPENVINO_ROOT 'runtime\bin\intel64\Release' + Copy-Item -Path (Join-Path $ovBin '*.dll') -Destination $dest -Force + Copy-Item -Path (Join-Path $ovBin 'cache.json') -Destination $dest -Force + + $tbbBin = Join-Path $OPENVINO_ROOT 'runtime\3rdparty\tbb\bin' + Copy-Item -Path (Join-Path $tbbBin 'tbb*.dll') -Destination $dest -Force + + # OpenVINO licensing + $licensingDest = Join-Path $dest 'openvino-licensing' + New-Item -ItemType Directory -Force -Path $licensingDest | Out-Null + Copy-Item -Path (Join-Path $OPENVINO_ROOT 'docs\licensing\*') -Destination $licensingDest -Recurse -Force + + Copy-Item LICENSE $dest + 7z a -snl llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip $dest\* + + - name: Upload artifacts + uses: actions/upload-artifact@v6 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip + name: llama-bin-win-openvino-${{ env.OPENVINO_VERSION_MAJOR }}-x64.zip + windows-cpu: needs: [check-release] if: ${{ needs.check-release.outputs.should_release == 'true' }} @@ -754,215 +934,211 @@ jobs: path: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip name: cudart-llama-bin-win-cuda-${{ matrix.cuda }}-x64.zip -# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705) -# in order to enable it again, we have to provision dedicated runners to run it -# windows-sycl: -# -# runs-on: windows-2022 -# -# defaults: -# run: -# shell: bash -# -# env: -# WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe -# WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel -# LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip -# ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" -# ONEAPI_INSTALLER_VERSION: "2025.3.3" -# -# steps: -# - name: Clone -# id: checkout -# uses: actions/checkout@v6 -# -# - name: Use oneAPI Installation Cache -# uses: actions/cache@v5 -# id: cache-sycl -# with: -# path: ${{ env.ONEAPI_ROOT }} -# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} -# -# - name: Download & Install oneAPI -# shell: bash -# if: steps.cache-sycl.outputs.cache-hit != 'true' -# run: | -# scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL -# -# - name: Install Level Zero SDK -# shell: pwsh -# run: | -# Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip" -# Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force -# "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append -# -# - name: Setup Node.js -# uses: actions/setup-node@v6 -# with: -# node-version: "24" -# cache: "npm" -# cache-dependency-path: "tools/ui/package-lock.json" -# -# - name: ccache -# uses: ggml-org/ccache-action@v1.2.21 -# with: -# key: release-windows-2022-x64-sycl -# -# - name: Build -# id: cmake_build -# shell: cmd -# run: | -# call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force -# cmake -G "Ninja" -B build ^ -# -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^ -# -DCMAKE_BUILD_TYPE=Release ^ -# -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^ -# -DGGML_CPU=OFF -DGGML_SYCL=ON ^ -# -DLLAMA_BUILD_BORINGSSL=ON -# cmake --build build --target ggml-sycl -j -# -# - name: Build the release package -# id: pack_artifacts -# run: | -# echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" -# -# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin -# -# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin -# ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true) -# if [ -n "$ZE_LOADER_DLL" ]; then -# echo "Using Level Zero loader: $ZE_LOADER_DLL" -# cp "$ZE_LOADER_DLL" ./build/bin -# else -# echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime" -# fi -# -# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin -# -# cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin -# -# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin -# cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin -# -# echo "cp oneAPI running time dll files to ./build/bin done" -# 7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/* -# -# - name: Upload the release package -# uses: actions/upload-artifact@v6 -# with: -# path: llama-bin-win-sycl-x64.zip -# name: llama-bin-win-sycl-x64.zip + windows-sycl: + needs: [check-release] + if: ${{ needs.check-release.outputs.should_release == 'true' }} -# TODO: this build is disabled to save Github Actions resources (https://github.com/ggml-org/llama.cpp/pull/23705) -# in order to enable it again, we have to provision dedicated runners to run it -# ubuntu-24-sycl: -# -# strategy: -# matrix: -# build: [fp32] -# include: -# - build: fp32 -# fp16: OFF -# -# runs-on: ubuntu-24.04 -# -# env: -# ONEAPI_ROOT: /opt/intel/oneapi/ -# ONEAPI_INSTALLER_VERSION: "2025.3.3" -# LEVEL_ZERO_VERSION: "1.28.2" -# LEVEL_ZERO_UBUNTU_VERSION: "u24.04" -# -# steps: -# - name: Clone -# id: checkout -# uses: actions/checkout@v6 -# with: -# fetch-depth: 0 -# -# - name: Use oneAPI Installation Cache -# uses: actions/cache@v5 -# id: cache-sycl -# with: -# path: ${{ env.ONEAPI_ROOT }} -# key: cache-gha-oneAPI-${{ env.ONEAPI_INSTALLER_VERSION }}-${{ runner.os }} -# -# - name: Download & Install oneAPI -# shell: bash -# if: steps.cache-sycl.outputs.cache-hit != 'true' -# run: | -# cd /tmp -# wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh -# sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept -# -# - name: Install Level Zero SDK -# shell: bash -# run: | -# cd /tmp -# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb -# wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb -# sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb -# -# - name: Setup Node.js -# uses: actions/setup-node@v6 -# with: -# node-version: "24" -# cache: "npm" -# cache-dependency-path: "tools/ui/package-lock.json" -# -# - name: ccache -# uses: ggml-org/ccache-action@v1.2.21 -# with: -# key: release-ubuntu-24.04-sycl -# -# - name: Build -# id: cmake_build -# run: | -# source /opt/intel/oneapi/setvars.sh -# cmake -B build \ -# -G "Ninja" \ -# -DCMAKE_BUILD_TYPE=Release \ -# -DGGML_SYCL=ON \ -# -DCMAKE_C_COMPILER=icx \ -# -DCMAKE_CXX_COMPILER=icpx \ -# -DLLAMA_OPENSSL=OFF \ -# -DGGML_NATIVE=OFF \ -# -DGGML_SYCL_F16=${{ matrix.fp16 }} -# time cmake --build build --config Release -j $(nproc) -# -# - name: Determine tag name -# id: tag -# uses: ./.github/actions/get-tag-name -# -# - name: Pack artifacts -# id: pack_artifacts -# run: | -# cp LICENSE ./build/bin/ -# tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . -# -# - name: Upload artifacts -# uses: actions/upload-artifact@v6 -# with: -# path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz -# name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz + runs-on: windows-2022 - ubuntu-22-rocm: + defaults: + run: + shell: bash + + env: + WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b60765d1-2b85-4e85-86b6-cb0e9563a699/intel-deep-learning-essentials-2025.3.3.18_offline.exe + WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel:intel.oneapi.win.dnnl:intel.oneapi.win.tbb.devel + LEVEL_ZERO_SDK_URL: https://github.com/oneapi-src/level-zero/releases/download/v1.28.2/level-zero-win-sdk-1.28.2.zip + ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI" + ONEAPI_INSTALLER_VERSION: "2025.3.3" + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + + - name: Download & Install oneAPI + shell: bash + run: | + scripts/install-oneapi.bat $WINDOWS_BASEKIT_URL $WINDOWS_DPCPP_MKL + + - name: Install Level Zero SDK + shell: pwsh + run: | + Invoke-WebRequest -Uri "${{ env.LEVEL_ZERO_SDK_URL }}" -OutFile "level-zero-win-sdk.zip" + Expand-Archive -Path "level-zero-win-sdk.zip" -DestinationPath "C:/level-zero-sdk" -Force + "LEVEL_ZERO_V1_SDK_PATH=C:/level-zero-sdk" | Out-File -FilePath $env:GITHUB_ENV -Append + + - name: Setup Node.js + uses: actions/setup-node@v6 + with: + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: release-windows-2022-x64-sycl + + - name: Build + id: cmake_build + shell: cmd + run: | + call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force + cmake -G "Ninja" -B build ^ + -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DGGML_BACKEND_DL=ON -DBUILD_SHARED_LIBS=ON ^ + -DGGML_CPU=OFF -DGGML_SYCL=ON ^ + -DLLAMA_BUILD_BORINGSSL=ON + cmake --build build --target ggml-sycl -j %NUMBER_OF_PROCESSORS% + + - name: ccache-clear + uses: ./.github/actions/ccache-clear + with: + key: release-windows-2022-x64-sycl + + - name: Build the release package + id: pack_artifacts + run: | + echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin" + + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.5.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin + + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_level_zero_v2.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_adapter_opencl.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_loader.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/ur_win_proxy_loader.dll" ./build/bin + ZE_LOADER_DLL=$(find "${{ env.ONEAPI_ROOT }}" "$LEVEL_ZERO_V1_SDK_PATH" -iname ze_loader.dll -print -quit 2>/dev/null || true) + if [ -n "$ZE_LOADER_DLL" ]; then + echo "Using Level Zero loader: $ZE_LOADER_DLL" + cp "$ZE_LOADER_DLL" ./build/bin + else + echo "Level Zero loader DLL not found in oneAPI or SDK; relying on system driver/runtime" + fi + + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl8.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libiomp5md.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl-ls.exe" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-fallback-bfloat16.spv" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libsycl-native-bfloat16.spv" ./build/bin + + cp "${{ env.ONEAPI_ROOT }}/dnnl/latest/bin/dnnl.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/tbb/latest/bin/tbb12.dll" ./build/bin + + cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/tcm.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/tcm/latest/bin/libhwloc-15.dll" ./build/bin + cp "${{ env.ONEAPI_ROOT }}/umf/latest/bin/umf.dll" ./build/bin + + echo "cp oneAPI running time dll files to ./build/bin done" + 7z a -snl llama-bin-win-sycl-x64.zip ./build/bin/* + + - name: Upload the release package + uses: actions/upload-artifact@v6 + with: + path: llama-bin-win-sycl-x64.zip + name: llama-bin-win-sycl-x64.zip + + ubuntu-24-sycl: needs: [check-release] if: ${{ needs.check-release.outputs.should_release == 'true' }} + strategy: + matrix: + build: [fp32, fp16] + include: + - build: fp32 + fp16: OFF + - build: fp16 + fp16: ON + + runs-on: ubuntu-24.04 + + env: + ONEAPI_ROOT: /opt/intel/oneapi/ + ONEAPI_INSTALLER_VERSION: "2025.3.3" + LEVEL_ZERO_VERSION: "1.28.2" + LEVEL_ZERO_UBUNTU_VERSION: "u24.04" + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v6 + with: + fetch-depth: 0 + + - name: Download & Install oneAPI + shell: bash + run: | + cd /tmp + wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/56f7923a-adb8-43f3-8b02-2b60fcac8cab/intel-deep-learning-essentials-2025.3.3.16_offline.sh -O intel-deep-learning-essentials_offline.sh + sudo bash intel-deep-learning-essentials_offline.sh -s -a --silent --eula accept + + - name: Install Level Zero SDK + shell: bash + run: | + cd /tmp + wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero.deb + wget -q "https://github.com/oneapi-src/level-zero/releases/download/v${LEVEL_ZERO_VERSION}/level-zero-devel_${LEVEL_ZERO_VERSION}%2B${LEVEL_ZERO_UBUNTU_VERSION}_amd64.deb" -O level-zero-devel.deb + sudo apt-get install -y ./level-zero.deb ./level-zero-devel.deb + + - name: Setup Node.js + uses: actions/setup-node@v6 + with: + node-version: "24" + cache: "npm" + cache-dependency-path: "tools/ui/package-lock.json" + + - name: ccache + uses: ggml-org/ccache-action@v1.2.21 + with: + key: release-ubuntu-24.04-sycl-${{ matrix.build }} + + - name: Build + id: cmake_build + run: | + source /opt/intel/oneapi/setvars.sh + cmake -B build \ + -G "Ninja" \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_SYCL=ON \ + -DCMAKE_C_COMPILER=icx \ + -DCMAKE_CXX_COMPILER=icpx \ + -DLLAMA_OPENSSL=OFF \ + -DGGML_NATIVE=OFF \ + -DGGML_SYCL_F16=${{ matrix.fp16 }} + time cmake --build build --config Release -j $(nproc) + + - name: ccache-clear + uses: ./.github/actions/ccache-clear + with: + key: release-ubuntu-24.04-sycl-${{ matrix.build }} + + - name: Determine tag name + id: tag + uses: ./.github/actions/get-tag-name + + - name: Pack artifacts + id: pack_artifacts + run: | + cp LICENSE ./build/bin/ + tar -czvf llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz --transform "s,^\.,llama-${{ steps.tag.outputs.name }}," -C ./build/bin . + + - name: Upload artifacts + uses: actions/upload-artifact@v6 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz + name: llama-bin-ubuntu-sycl-${{ matrix.build }}-x64.tar.gz + + ubuntu-22-rocm: + needs: [check-release, get-version] + if: ${{ needs.check-release.outputs.should_release == 'true' }} + runs-on: ubuntu-22.04 permissions: @@ -1052,6 +1228,7 @@ jobs: -DGGML_HIP=ON \ -DHIP_PLATFORM=amd \ -DGGML_HIP_ROCWMMA_FATTN=ON \ + -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} \ ${{ env.CMAKE_ARGS }} cmake --build build --config Release -j $(nproc) @@ -1080,7 +1257,7 @@ jobs: name: llama-bin-ubuntu-rocm-${{ env.ROCM_VERSION_SHORT }}-${{ matrix.build }}.tar.gz windows-hip: - needs: [check-release] + needs: [check-release, get-version] if: ${{ needs.check-release.outputs.should_release == 'true' }} runs-on: windows-2022 @@ -1176,6 +1353,7 @@ jobs: -DGPU_TARGETS="${{ matrix.gpu_targets }}" ` -DGGML_HIP_ROCWMMA_FATTN=ON ` -DGGML_HIP=ON ` + -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} ` -DLLAMA_BUILD_BORINGSSL=ON cmake --build build --target ggml-hip -j ${env:NUMBER_OF_PROCESSORS} md "build\bin\rocblas\library\" @@ -1203,7 +1381,7 @@ jobs: name: llama-bin-win-hip-${{ matrix.name }}-x64.zip ios-xcode: - needs: [check-release] + needs: [check-release, get-version] if: ${{ needs.check-release.outputs.should_release == 'true' }} runs-on: macos-26 @@ -1232,7 +1410,8 @@ jobs: -DLLAMA_BUILD_SERVER=OFF \ -DCMAKE_SYSTEM_NAME=iOS \ -DCMAKE_OSX_DEPLOYMENT_TARGET=16.0 \ - -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml + -DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml \ + -DHF_UI_VERSION=${{ needs.get-version.outputs.ui_version }} cmake --build build --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO - name: xcodebuild for swift package @@ -1352,10 +1531,12 @@ jobs: # path: llama-${{ steps.tag.outputs.name }}-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz # name: llama-bin-${{ matrix.chip_type }}-openEuler-${{ matrix.arch }}${{ matrix.use_acl_graph == 'on' && '-aclgraph' || '' }}.tar.gz - ui: - needs: [check-release] + ui-build: + needs: [check-release, get-version] if: ${{ needs.check-release.outputs.should_release == 'true' }} uses: ./.github/workflows/ui-build.yml + with: + hf_ui_version: ${{ needs.get-version.outputs.ui_version }} release: if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} @@ -1368,11 +1549,13 @@ jobs: runs-on: ubuntu-slim needs: + - get-version - windows - windows-cpu - windows-cuda #- windows-sycl - windows-hip + - windows-openvino - ubuntu-22-rocm - ubuntu-cpu - ubuntu-vulkan @@ -1382,7 +1565,7 @@ jobs: - macos-cpu - ios-xcode #- openEuler-cann - - ui + - ui-build outputs: tag_name: ${{ steps.tag.outputs.name }} @@ -1482,7 +1665,8 @@ jobs: - [Ubuntu arm64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-vulkan-arm64.tar.gz) - [Ubuntu x64 (ROCm 7.2)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-rocm-7.2-x64.tar.gz) - [Ubuntu x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-openvino-${{ needs.ubuntu-24-openvino.outputs.openvino_version }}-x64.tar.gz) - - Ubuntu x64 (SYCL FP32) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705) + - [Ubuntu x64 (SYCL FP32)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp32-x64.tar.gz) + - [Ubuntu x64 (SYCL FP16)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-ubuntu-sycl-fp16-x64.tar.gz) **Android:** - [Android arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-android-arm64.tar.gz) @@ -1490,10 +1674,12 @@ jobs: **Windows:** - [Windows x64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-x64.zip) - [Windows arm64 (CPU)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cpu-arm64.zip) + - [Windows arm64 (OpenCL Adreno)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-opencl-adreno-arm64.zip) - [Windows x64 (CUDA 12)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-12.4-x64.zip) - [CUDA 12.4 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-12.4-x64.zip) - [Windows x64 (CUDA 13)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-cuda-13.3-x64.zip) - [CUDA 13.3 DLLs](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/cudart-llama-bin-win-cuda-13.3-x64.zip) - [Windows x64 (Vulkan)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-vulkan-x64.zip) - - Windows x64 (SYCL) [DISABLED](https://github.com/ggml-org/llama.cpp/pull/23705) + - [Windows x64 (OpenVINO)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-openvino-${{ needs.windows-openvino.outputs.openvino_version }}-x64.zip) + - [Windows x64 (SYCL)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip) - [Windows x64 (HIP)](https://github.com/ggml-org/llama.cpp/releases/download/${{ steps.tag.outputs.name }}/llama-${{ steps.tag.outputs.name }}-bin-win-hip-radeon-x64.zip) **openEuler:** diff --git a/.github/workflows/ui-build-self-hosted.yml b/.github/workflows/ui-build-self-hosted.yml index e5d576cda625..7b7f8b60025a 100644 --- a/.github/workflows/ui-build-self-hosted.yml +++ b/.github/workflows/ui-build-self-hosted.yml @@ -28,13 +28,6 @@ jobs: run: npm run build working-directory: tools/ui - - name: Generate checksums - run: | - cd tools/ui/dist - for f in *; do - sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt - done - - name: Upload built UI uses: actions/upload-artifact@v6 with: diff --git a/.github/workflows/ui-build.yml b/.github/workflows/ui-build.yml index 92b0573fb8d2..85642f3f4b76 100644 --- a/.github/workflows/ui-build.yml +++ b/.github/workflows/ui-build.yml @@ -2,6 +2,11 @@ name: UI Build on: workflow_call: + inputs: + hf_ui_version: + description: 'Version string for version.json (e.g. 12345)' + required: false + type: string jobs: build: @@ -25,15 +30,15 @@ jobs: working-directory: tools/ui - name: Build application + env: + HF_UI_VERSION: ${{ inputs.hf_ui_version || '' }} + LLAMA_BUILD_NUMBER: ${{ inputs.hf_ui_version || 'b0000' }} run: npm run build working-directory: tools/ui - - name: Generate checksums - run: | - cd tools/ui/dist - for f in *; do - sha256sum "$f" | awk '{print $1, $2}' >> checksums.txt - done + - name: Run PWA unit tests (versioned build output) + run: npx vitest --project=unit --run tests/unit/pwa.spec.ts + working-directory: tools/ui - name: Upload built UI uses: actions/upload-artifact@v6 diff --git a/.github/workflows/ui-publish.yml b/.github/workflows/ui-publish.yml index cec0fa52a122..c3b7343c655b 100644 --- a/.github/workflows/ui-publish.yml +++ b/.github/workflows/ui-publish.yml @@ -40,6 +40,12 @@ jobs: name: ui-build path: tools/ui/dist/ + - name: Create distribution archive + run: | + tar -czf dist.tar.gz -C tools/ui/dist . + sha256sum dist.tar.gz > dist.tar.gz.sha256 + mv dist.tar.gz dist.tar.gz.sha256 tools/ui/dist/ + - name: Install Hugging Face Hub CLI run: pip install -U huggingface_hub diff --git a/.github/workflows/ui-self-hosted.yml b/.github/workflows/ui-self-hosted.yml index 5457d900c872..79d7800d6bbd 100644 --- a/.github/workflows/ui-self-hosted.yml +++ b/.github/workflows/ui-self-hosted.yml @@ -1,8 +1,8 @@ name: UI (self-hosted) # these are the same as ui.yml, but with self-hosted runners -# the runners come with pre-installed Playwright browsers version: 1.56.1 -# the jobs are much lighter because they don't need to install node and playwright browsers +# the jobs are lighter because they don't need to install Node.js or Playwright browsers +# the runner has pre-installed Playwright browsers for @playwright/test (1.56.1) at /ms-playwright/ on: workflow_dispatch: @@ -61,6 +61,12 @@ jobs: run: npm ci working-directory: tools/ui + - name: Download built UI artifacts + uses: actions/download-artifact@v6 + with: + name: ui-build + path: tools/ui/dist/ + - name: Run type checking if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run check @@ -72,12 +78,12 @@ jobs: working-directory: tools/ui - name: Run Client tests - if: ${{ always() }} + if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run test:client working-directory: tools/ui - name: Run Unit tests - if: ${{ always() }} + if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run test:unit working-directory: tools/ui @@ -97,22 +103,23 @@ jobs: run: npm ci working-directory: tools/ui - - name: Build application - if: ${{ always() && steps.setup.conclusion == 'success' }} - run: npm run build - working-directory: tools/ui + - name: Download built UI artifacts + uses: actions/download-artifact@v6 + with: + name: ui-build + path: tools/ui/dist/ - name: Build Storybook - if: ${{ always() }} + if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run build-storybook working-directory: tools/ui - name: Run UI tests - if: ${{ always() }} + if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run test:ui -- --testTimeout=60000 working-directory: tools/ui - name: Run E2E tests - if: ${{ always() }} + if: ${{ always() && steps.setup.conclusion == 'success' }} run: npm run test:e2e working-directory: tools/ui diff --git a/.github/workflows/ui.yml b/.github/workflows/ui.yml index b3712e450592..fa99a0cda2cf 100644 --- a/.github/workflows/ui.yml +++ b/.github/workflows/ui.yml @@ -43,7 +43,7 @@ jobs: ui-checks: name: Checks needs: ui-build - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 continue-on-error: true steps: - name: Checkout code @@ -60,6 +60,12 @@ jobs: cache: "npm" cache-dependency-path: "tools/ui/package-lock.json" + - name: Download built UI artifacts + uses: actions/download-artifact@v6 + with: + name: ui-build + path: tools/ui/dist/ + - name: Install dependencies id: setup if: ${{ steps.node.conclusion == 'success' }} @@ -87,7 +93,7 @@ jobs: run: npm run test:client working-directory: tools/ui - - name: Run Unit tests + - name: Run Unit tests (uses pre-built dist/ from ui-build) if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:unit working-directory: tools/ui @@ -95,7 +101,7 @@ jobs: e2e-tests: name: E2E Tests needs: ui-build - runs-on: ubuntu-latest + runs-on: ubuntu-24.04 steps: - name: Checkout code uses: actions/checkout@v6 @@ -117,10 +123,11 @@ jobs: run: npm ci working-directory: tools/ui - - name: Build application - if: ${{ always() && steps.setup.conclusion == 'success' }} - run: npm run build - working-directory: tools/ui + - name: Download built UI artifacts (reuses ui-build) + uses: actions/download-artifact@v6 + with: + name: ui-build + path: tools/ui/dist/ - name: Install Playwright browsers id: playwright @@ -138,7 +145,7 @@ jobs: run: npm run test:ui -- --testTimeout=60000 working-directory: tools/ui - - name: Run E2E tests + - name: Run E2E tests (uses pre-built dist/ from ui-build) if: ${{ always() && steps.playwright.conclusion == 'success' }} run: npm run test:e2e working-directory: tools/ui diff --git a/.gitignore b/.gitignore index 8dc9d7d0b8dd..9b589615a402 100644 --- a/.gitignore +++ b/.gitignore @@ -92,13 +92,6 @@ !/examples/sycl/*.bat !/examples/sycl/*.sh -# Server Web UI temporary files (+ legacy directory) - -/tools/server/webui/node_modules -/tools/server/webui/dist -/tools/ui/node_modules -/tools/ui/dist - # Python /.venv diff --git a/.pi/gg/SYSTEM.md b/.pi/gg/SYSTEM.md index 197173faed80..17ce71cc1b5a 100644 --- a/.pi/gg/SYSTEM.md +++ b/.pi/gg/SYSTEM.md @@ -25,13 +25,3 @@ Commits: - Do not explicitly set the git author in commits - rely on the default git config - Always use `--no-gpg-sign` when committing - Never `git push` without explicit confirmation from the user - -Resources (read on demand): -- [CONTRIBUTING.md](CONTRIBUTING.md) -- [Build documentation](docs/build.md) -- [Server usage documentation](tools/server/README.md) -- [Server development documentation](tools/server/README-dev.md) -- [PEG parser](docs/development/parsing.md) -- [Auto parser](docs/autoparser.md) -- [Jinja engine](common/jinja/README.md) -- [PR template](.github/pull_request_template.md) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e7b1253c726..81f23d7e70b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -222,6 +222,16 @@ if (LLAMA_BUILD_APP) add_subdirectory(app) endif() +# Standalone libmtmd build without pulling in the rest of the tools/ tree. +# Useful when packaging just the mtmd library for language bindings (e.g. an +# Apple XCFramework, or a WASM build). When the full tools build is enabled, +# mtmd is already built by the tools/ subdirectory above; this hook only fires +# when LLAMA_BUILD_TOOLS is OFF to avoid double-adding the target. +option(LLAMA_BUILD_MTMD "llama: build tools/mtmd library standalone" OFF) +if (LLAMA_BUILD_MTMD AND NOT (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TOOLS)) + add_subdirectory(tools/mtmd) +endif() + # # install # diff --git a/CODEOWNERS b/CODEOWNERS index 4b9d90177154..46fd518b7e51 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -10,7 +10,7 @@ # ggml-org/ggml-rpc : rgerganov # ggml-org/ggml-sycl : arthw # ggml-org/ggml-vulkan : 0cc4m, jeffbolznv -# ggml-org/ggml-webgpu : reeselevine +# ggml-org/ggml-webgpu : reeselevine, yomaytk # ggml-org/ggml-zdnn : taronaeo # ggml-org/llama-common : ggerganov, aldehir, angt, danbev, ngxson, pwilkin # ggml-org/llama-mtmd : ngxson diff --git a/README.md b/README.md index d9f2e18231c7..e98f2b7f18b1 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # llama.cpp -![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) +![llama](https://raw.githubusercontent.com/ggml-org/llama.brand/refs/heads/master/cover/llama-cpp/cover-llama-cpp-dark.svg) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases) @@ -37,7 +37,7 @@ LLM inference in C/C++ Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine: -- Install `llama.cpp` using [brew, nix or winget](docs/install.md) +- Install `llama.cpp` using [brew, nix, winget, or conda-forge](docs/install.md) - Run with Docker - see our [Docker documentation](docs/docker.md) - Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases) - Build from source by cloning this repository - check out [our build guide](docs/build.md) @@ -142,7 +142,9 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct) - [X] [Trillion-7B-preview](https://huggingface.co/trillionlabs/Trillion-7B-preview) - [x] [Ling models](https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32) -- [x] [LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38) +- [x] [Liquid LFM2 models](https://huggingface.co/collections/LiquidAI/lfm2) +- [x] [Liquid LFM2.5 models](https://huggingface.co/collections/LiquidAI/lfm25) +- [x] [Liquid Nanos](https://huggingface.co/collections/LiquidAI/liquid-nanos) - [x] [Hunyuan models](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7) - [x] [BailingMoeV2 (Ring/Ling 2.0) models](https://huggingface.co/collections/inclusionAI/ling-v2-68bf1dd2fc34c306c1fa6f86) - [x] [Mellum models](https://huggingface.co/JetBrains/models?search=mellum) diff --git a/SECURITY.md b/SECURITY.md index a98b8e70bd88..0e704e3280f6 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -80,7 +80,7 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru ### Untrusted environments or networks If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions: -* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061). +* Do not use the RPC backend, [ggml-rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/tools/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061). * Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value. * Encrypt your data if sending it over the network. diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt index 3ce503955b36..3450ff49000f 100644 --- a/app/CMakeLists.txt +++ b/app/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET llama-app) -add_executable(${TARGET} llama.cpp) +add_executable(${TARGET} llama.cpp download.cpp) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama) target_link_libraries(${TARGET} PRIVATE diff --git a/app/download.cpp b/app/download.cpp new file mode 100644 index 000000000000..7227baadcb18 --- /dev/null +++ b/app/download.cpp @@ -0,0 +1,71 @@ +#include "arg.h" +#include "common.h" +#include "download.h" +#include "log.h" + +#include +#include + +static void print_usage(int /*argc*/, char ** argv) { + printf( + "\nexamples:\n" + " %s -hf ggml-org/gemma-3-4b-it-qat-GGUF\n" + " %s -hf ggml-org/gemma-3-4b-it-qat-GGUF:Q4_K_M\n" + " %s -hf ggml-org/models -hff model.gguf\n" + " %s -mu https://example.com/model.gguf -m model.gguf\n" + "\n", + argv[0], argv[0], argv[0], argv[0] + ); +} + +int llama_download(int argc, char ** argv); + +int llama_download(int argc, char ** argv) { + common_init(); + + common_params params; + params.verbosity = LOG_LEVEL_ERROR; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_DOWNLOAD, print_usage)) { + return 1; + } + + const bool has_source = !params.model.hf_repo.empty() || !params.model.url.empty() || + !params.model.path.empty() || !params.model.docker_repo.empty(); + if (!has_source) { + fprintf(stderr, "error: no model source specified (use --hf-repo, --model-url, --model or --docker-repo)\n"); + return 1; + } + + try { + common_models_handler handler = common_models_handler_init(params, LLAMA_EXAMPLE_DOWNLOAD); + common_models_handler_apply(handler, params); + } catch (const std::exception & e) { + fprintf(stderr, "error: %s\n", e.what()); + return 1; + } + + if (!params.models_preset.empty()) { + // -hf pointed at a preset repo: print the preset path and stop + printf("%s\n", params.models_preset.c_str()); + return 0; + } + if (params.model.path.empty()) { + fprintf(stderr, "error: model download failed\n"); + return 1; + } + if (!std::filesystem::exists(params.model.path)) { + fprintf(stderr, "error: model file does not exist: %s\n", params.model.path.c_str()); + return 1; + } + + printf("%s\n", params.model.path.c_str()); + if (!params.mmproj.path.empty()) { + printf("%s\n", params.mmproj.path.c_str()); + } + if (!params.speculative.draft.mparams.path.empty()) { + printf("%s\n", params.speculative.draft.mparams.path.c_str()); + } + + return 0; +} diff --git a/app/llama.cpp b/app/llama.cpp index 30b09f9ef7e3..2cf1aa876ce0 100644 --- a/app/llama.cpp +++ b/app/llama.cpp @@ -19,17 +19,23 @@ int llama_batched_bench(int argc, char ** argv); int llama_fit_params(int argc, char ** argv); int llama_quantize(int argc, char ** argv); int llama_perplexity(int argc, char ** argv); +int llama_download(int argc, char ** argv); -// hands the update over to the install script, which downloads and swaps the binary +// Self-update is only supported for binaries built with llama-install.sh static int llama_update(int argc, char ** argv) { (void) argc; (void) argv; +#ifdef LLAMA_INSTALL_BUILD #if defined(_WIN32) return system("powershell -NoProfile -ExecutionPolicy Bypass -Command \"irm https://llama.app/install.ps1 | iex\""); #else return system("curl -fsSL https://llama.app/install.sh | sh"); #endif +#else + printf("Updates are available only when installed from https://llama.app\n"); + return 1; +#endif } static const char * progname; @@ -44,23 +50,33 @@ struct command { std::vector aliases; bool hidden; int (*func)(int, char **); + bool flags = false; // allow --name }; +#ifdef LLAMA_INSTALL_BUILD +#define UPDATE_HIDDEN false +#else +#define UPDATE_HIDDEN true +#endif + static const command cmds[] = { - {"serve", "HTTP API server", {"server"}, false, llama_server }, - {"cli", "Command-line interactive interface", {"client"}, false, llama_cli }, - {"update", "Update llama to the latest release", {}, false, llama_update }, - {"completion", "Text completion", {"complete"}, true, llama_completion }, - {"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench }, - {"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench}, - {"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params }, - {"quantize", "Quantize a model", {}, true, llama_quantize }, - {"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity }, - {"version", "Show version", {}, false, version }, - {"licenses", "Show third-party licenses", {"credits"}, false, licenses }, - {"help", "Show available commands", {}, false, help }, + {"serve", "HTTP API server", {"server"}, false, llama_server }, + {"cli", "Command-line interactive interface", {"client"}, false, llama_cli }, + {"update", "Update llama to the latest release", {}, UPDATE_HIDDEN, llama_update }, + {"download", "Download a model", {"get"}, false, llama_download }, + {"completion", "Text completion", {"complete"}, true, llama_completion }, + {"bench", "Benchmark prompt processing and text generation", {}, true, llama_bench }, + {"batched-bench", "Benchmark batched decoding performance", {}, true, llama_batched_bench}, + {"fit-params", "Compute parameters to fit a model in device memory", {}, true, llama_fit_params }, + {"quantize", "Quantize a model", {}, true, llama_quantize }, + {"perplexity", "Compute model perplexity and KL divergence", {}, true, llama_perplexity }, + {"version", "Show version", {}, false, version, true }, + {"licenses", "Show third-party licenses", {"credits"}, false, licenses, true }, + {"help", "Show available commands", {}, false, help, true }, }; +#undef UPDATE_HIDDEN + static int version(int argc, char ** argv) { printf("%s\n", llama_build_info()); return 0; @@ -93,7 +109,10 @@ static int help(int argc, char ** argv) { return 0; } -static bool matches(const std::string & arg, const command & cmd) { +static bool matches(std::string arg, const command & cmd) { + if (cmd.flags && arg.size() > 2 && arg[0] == '-' && arg[1] == '-') { + arg.erase(0, 2); + } if (arg == cmd.name) { return true; } diff --git a/build-xcframework.sh b/build-xcframework.sh index 180c01a88e94..697278d050c5 100755 --- a/build-xcframework.sh +++ b/build-xcframework.sh @@ -13,6 +13,7 @@ LLAMA_BUILD_EXAMPLES=OFF LLAMA_BUILD_TOOLS=OFF LLAMA_BUILD_TESTS=OFF LLAMA_BUILD_SERVER=OFF +LLAMA_BUILD_MTMD=ON GGML_METAL=ON GGML_METAL_EMBED_LIBRARY=ON GGML_BLAS_DEFAULT=ON @@ -39,6 +40,7 @@ COMMON_CMAKE_ARGS=( -DLLAMA_BUILD_TOOLS=${LLAMA_BUILD_TOOLS} -DLLAMA_BUILD_TESTS=${LLAMA_BUILD_TESTS} -DLLAMA_BUILD_SERVER=${LLAMA_BUILD_SERVER} + -DLLAMA_BUILD_MTMD=${LLAMA_BUILD_MTMD} -DGGML_METAL_EMBED_LIBRARY=${GGML_METAL_EMBED_LIBRARY} -DGGML_BLAS_DEFAULT=${GGML_BLAS_DEFAULT} -DGGML_METAL=${GGML_METAL} @@ -126,6 +128,8 @@ setup_framework_structure() { cp ggml/include/ggml-cpu.h ${header_path} cp ggml/include/ggml-blas.h ${header_path} cp ggml/include/gguf.h ${header_path} + cp tools/mtmd/mtmd.h ${header_path} + cp tools/mtmd/mtmd-helper.h ${header_path} # Create module map (common for all platforms) cat > ${module_path}module.modulemap << EOF @@ -247,6 +251,7 @@ combine_static_libraries() { "${base_dir}/${build_dir}/ggml/src/${release_dir}/libggml-cpu.a" "${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a" "${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a" + "${base_dir}/${build_dir}/tools/mtmd/${release_dir}/libmtmd.a" ) # Create temporary directory for processing @@ -410,6 +415,7 @@ cmake -B build-ios-sim -G Xcode \ -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ -DLLAMA_OPENSSL=OFF \ + -DMTMD_VIDEO=OFF \ -S . cmake --build build-ios-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet @@ -424,6 +430,7 @@ cmake -B build-ios-device -G Xcode \ -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ -DLLAMA_OPENSSL=OFF \ + -DMTMD_VIDEO=OFF \ -S . cmake --build build-ios-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet @@ -450,6 +457,7 @@ cmake -B build-visionos -G Xcode \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ -DLLAMA_OPENSSL=OFF \ -DLLAMA_BUILD_SERVER=OFF \ + -DMTMD_VIDEO=OFF \ -S . cmake --build build-visionos --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet @@ -465,6 +473,7 @@ cmake -B build-visionos-sim -G Xcode \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ -DLLAMA_OPENSSL=OFF \ -DLLAMA_BUILD_SERVER=OFF \ + -DMTMD_VIDEO=OFF \ -S . cmake --build build-visionos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet @@ -481,6 +490,7 @@ cmake -B build-tvos-sim -G Xcode \ -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ -DLLAMA_OPENSSL=OFF \ + -DMTMD_VIDEO=OFF \ -S . cmake --build build-tvos-sim --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet @@ -496,6 +506,7 @@ cmake -B build-tvos-device -G Xcode \ -DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \ -DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \ -DLLAMA_OPENSSL=OFF \ + -DMTMD_VIDEO=OFF \ -S . cmake --build build-tvos-device --config Release -j $(sysctl -n hw.logicalcpu) -- -quiet diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index c42320c46b16..fc16b21cf1c4 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -80,8 +80,6 @@ add_library(${TARGET} http.h imatrix-loader.cpp imatrix-loader.h - json-partial.cpp - json-partial.h json-schema-to-grammar.cpp llguidance.cpp log.cpp diff --git a/common/arg.cpp b/common/arg.cpp index 55795d357d90..841ca3ce2ec2 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -17,6 +17,7 @@ # define NOMINMAX #endif #include +#include #endif #define JSON_ASSERT GGML_ASSERT @@ -285,107 +286,16 @@ static std::string clean_file_name(const std::string & fname) { return clean_fname; } -static bool common_params_handle_remote_preset(common_params & params, llama_example ex) { - GGML_ASSERT(!params.model.hf_repo.empty()); - - // the returned hf_repo is without tag - auto [hf_repo, hf_tag] = common_download_split_repo_tag(params.model.hf_repo); - - // "latest" tag (default if not specified) is translated to "default" preset - if (hf_tag == "latest") { - hf_tag = "default"; - } - - std::string model_endpoint = common_get_model_endpoint(); - auto preset_url = model_endpoint + hf_repo + "/resolve/main/preset.ini"; - - // prepare local path for caching - auto preset_fname = clean_file_name(hf_repo + "_preset.ini"); - auto preset_path = fs_get_cache_file(preset_fname); - common_download_opts opts; - opts.bearer_token = params.hf_token; - opts.offline = params.offline; - - LOG_TRC("%s: looking for remote preset at %s\n", __func__, preset_url.c_str()); - const int status = common_download_file_single(preset_url, preset_path, opts); - const bool has_preset = status >= 200 && status < 400; - - // remote preset is optional, so we don't error out if not found - if (has_preset) { - LOG_TRC("%s: applying remote preset from %s\n", __func__, preset_url.c_str()); - common_preset_context ctx(ex, /* only_remote_allowed */ true); - common_preset global; - auto remote_presets = ctx.load_from_ini(preset_path, global); - remote_presets = ctx.cascade(global, remote_presets); - if (remote_presets.find(hf_tag) != remote_presets.end()) { - common_preset preset = remote_presets.at(hf_tag); - LOG_INF("\n%s", preset.to_ini().c_str()); // to_ini already added trailing newline - preset.apply_to_params(params); - } else { - throw std::runtime_error("Remote preset.ini does not contain [" + std::string(hf_tag) + "] section"); - } - } else { - LOG_TRC("%s: no remote preset found, skipping\n", __func__); - } - - return has_preset; -} - struct handle_model_result { bool found_mmproj = false; common_params_model mmproj; bool found_mtp = false; common_params_model mtp; -}; - -static handle_model_result common_params_handle_model(struct common_params_model & model, - const common_download_opts & opts) { - handle_model_result result; - - if (!model.docker_repo.empty()) { - model.path = common_docker_resolve_model(model.docker_repo); - model.name = model.docker_repo; - } else if (!model.hf_repo.empty()) { - // If -m was used with -hf, treat the model "path" as the hf_file to download - if (model.hf_file.empty() && !model.path.empty()) { - model.hf_file = model.path; - model.path = ""; - } - common_download_opts hf_opts = opts; - auto download_result = common_download_model(model, hf_opts); - - if (download_result.model_path.empty()) { - throw std::runtime_error("failed to download model from Hugging Face"); - } - model.name = model.hf_repo; - model.path = download_result.model_path; - - if (!download_result.mmproj_path.empty()) { - result.found_mmproj = true; - result.mmproj.path = download_result.mmproj_path; - } - - if (!download_result.mtp_path.empty()) { - result.found_mtp = true; - result.mtp.path = download_result.mtp_path; - } - } else if (!model.url.empty()) { - if (model.path.empty()) { - auto f = string_split(model.url, '#').front(); - f = string_split(f, '?').front(); - model.path = fs_get_cache_file(string_split(f, '/').back()); - } - - auto download_result = common_download_model(model, opts); - if (download_result.model_path.empty()) { - throw std::runtime_error("failed to download model from " + model.url); - } - } - - return result; -} + bool found_preset = false; + std::string preset_path; +}; const std::vector kv_cache_types = { GGML_TYPE_F32, @@ -431,61 +341,241 @@ static bool parse_bool_value(const std::string & value) { } // -// CLI argument parsing functions +// common_models_handler // -bool common_params_handle_models(common_params & params, llama_example curr_ex) { - const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(), - params.speculative.types.end(), - COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end(); +static std::string get_default_local_path(const std::string & url) { + auto f = string_split(url, '#').front(); + f = string_split(f, '?').front(); + return fs_get_cache_file(string_split(f, '/').back()); +} +common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex) { + common_download_hf_plan plan; + common_download_hf_plan plan_spec; + common_download_hf_plan plan_voc; common_download_opts opts; + + const bool spec_type_draft_mtp = std::find(params.speculative.types.begin(), + params.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params.speculative.types.end(); + + // only download mmproj if the current example is using it + bool use_mmproj = false; + for (const auto & ex : mmproj_examples) { + if (curr_ex == ex) { + use_mmproj = true; + break; + } + } + opts.bearer_token = params.hf_token; opts.offline = params.offline; - opts.skip_download = params.skip_download; opts.download_mtp = spec_type_draft_mtp; - opts.download_mmproj = !params.no_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty(); + opts.download_mmproj = use_mmproj && !params.no_mmproj + && params.mmproj.path.empty() && params.mmproj.url.empty(); - // sub-models (draft, mmproj, vocoder) are explicitly specified by the user, - // so we should not auto-discover mtp/mmproj siblings for them - common_download_opts sub_opts = opts; - sub_opts.download_mtp = false; - sub_opts.download_mmproj = false; + if (!params.model.hf_repo.empty()) { + plan = common_download_get_hf_plan(params.model, opts); + } - try { - auto res = common_params_handle_model(params.model, opts); - if (params.no_mmproj) { - params.mmproj = {}; - } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { - // optionally, handle mmproj model when -hf is specified - params.mmproj = res.mmproj; - } - // only download mmproj if the current example is using it - for (const auto & ex : mmproj_examples) { - if (curr_ex == ex) { - common_params_handle_model(params.mmproj, sub_opts); - break; + if (!params.speculative.draft.mparams.hf_repo.empty()) { + plan_spec = common_download_get_hf_plan(params.speculative.draft.mparams, opts); + } + + if (!params.vocoder.model.hf_repo.empty()) { + plan_voc = common_download_get_hf_plan(params.vocoder.model, opts); + } + + return common_models_handler{plan, plan_spec, plan_voc, opts}; +} + +bool common_models_handler_is_preset_repo(const common_models_handler & handler) { + return !handler.plan.preset.url.empty(); +} + +static std::vector build_url_tasks(const common_params_model & model, common_download_opts opts) { + auto parts = common_download_get_all_parts(model.url); + std::vector tasks; + + // single-part: download straight to model.path if the user gave one (-m), else the cache default + if (parts.size() == 1) { + common_download_task task; + task.url = parts[0]; + task.local_path = model.path.empty() ? get_default_local_path(parts[0]) : model.path; + task.opts = opts; + tasks.push_back(std::move(task)); + return tasks; + } + + // multi-part: place each part under the user's -m directory (if given), else the cache default + std::string base_dir; + if (!model.path.empty()) { + auto pos = model.path.rfind('/'); + base_dir = pos == std::string::npos ? std::string(".") : model.path.substr(0, pos); + } + + for (const auto & part : parts) { + common_download_task task; + task.url = part; + task.opts = opts; + + std::string local = get_default_local_path(part); + if (!base_dir.empty()) { + auto pos = local.rfind('/'); + std::string name = pos == std::string::npos ? local : local.substr(pos + 1); + local = base_dir + "/" + name; + } + task.local_path = local; + tasks.push_back(std::move(task)); + } + return tasks; +} + +void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback) { + std::vector tasks; + + auto & plan = handler.plan; + auto & plan_spec = handler.plan_spec; + auto & plan_voc = handler.plan_voc; + + auto opts = handler.opts; // copy + opts.callback = callback; + + // handle plain "url" if needed + auto handle_url = [&](common_params_model & model) { + if (!model.url.empty()) { + if (model.path.empty()) { + model.path = get_default_local_path(model.url); } } + }; + handle_url(params.model); + handle_url(params.mmproj); + handle_url(params.vocoder.model); + handle_url(params.speculative.draft.mparams); + + // optionally, if docker repo is set, resolve it + if (!params.model.docker_repo.empty()) { + params.model.url = common_docker_resolve_model(params.model.docker_repo); + params.model.path = get_default_local_path(params.model.url); + } - // when --spec-type mtp is set and no draft model was provided explicitly, - // fall back to the MTP head discovered alongside the -hf model - if (spec_type_draft_mtp && res.found_mtp && - params.speculative.draft.mparams.path.empty() && - params.speculative.draft.mparams.hf_repo.empty() && - params.speculative.draft.mparams.url.empty()) { - params.speculative.draft.mparams.path = res.mtp.path; + // handle plain "url" tasks (non-hf) + if (!params.model.url.empty()) { + auto url_tasks = build_url_tasks(params.model, opts); + // the first part is what gets loaded, so point params.model.path at it + if (!url_tasks.empty()) { + std::string first_path = url_tasks.front().local_path; + url_tasks.front().on_done = [&]() { params.model.path = first_path; }; + } + for (auto & task : url_tasks) { + tasks.push_back(std::move(task)); + } + } + if (!params.mmproj.url.empty()) { + common_download_task task; + task.url = params.mmproj.url; + task.local_path = params.mmproj.path; + task.opts = opts; + tasks.push_back(task); + } + if (!params.vocoder.model.url.empty()) { + common_download_task task; + task.url = params.vocoder.model.url; + task.local_path = params.vocoder.model.path; + task.opts = opts; + tasks.push_back(task); + } + if (!params.speculative.draft.mparams.url.empty()) { + common_download_task task; + task.url = params.speculative.draft.mparams.url; + task.local_path = params.speculative.draft.mparams.path; + task.opts = opts; + tasks.push_back(task); + } + + // handle hf_plan tasks + auto add_tasks = [&opts, &tasks](const hf_cache::hf_files & model_files, common_params_model & model) { + for (size_t i = 0; i < model_files.size(); ++i) { + auto & model_file = model_files[i]; + bool is_first = (i == 0); + tasks.emplace_back(model_file, opts, [&, is_first]() { + if (is_first) { + // only use first part as model path + model.path = hf_cache::finalize_file(model_file); + } else { + hf_cache::finalize_file(model_file); + } + }); + } + }; + if (!plan.model_files.empty()) { + add_tasks(plan.model_files, params.model); + } + if (!plan.mmproj.local_path.empty()) { + tasks.emplace_back(plan.mmproj, opts, [&]() { + params.mmproj.path = hf_cache::finalize_file(plan.mmproj); + }); + } + if (!plan.mtp.local_path.empty()) { + tasks.emplace_back(plan.mtp, opts, [&]() { + // only fall back to the discovered MTP head when no draft was explicitly provided + if (params.speculative.draft.mparams.empty()) { + params.speculative.draft.mparams.path = hf_cache::finalize_file(plan.mtp); + } else { + hf_cache::finalize_file(plan.mtp); + } + }); + } + if (!plan.preset.local_path.empty()) { + tasks.emplace_back(plan.preset, opts, [&]() { + // if HF repo is a preset repo, we simply run server in router mode with the preset.ini file + params.models_preset_hf = params.model.hf_repo; // only for showing a warning + params.models_preset = hf_cache::finalize_file(plan.preset); + params.model = common_params_model{}; // make sure to clear model, so server starts in router mode + }); + } + + // handle plan_spec (e.g. --spec-draft-hf) + if (!plan_spec.model_files.empty()) { + add_tasks(plan_spec.model_files, params.speculative.draft.mparams); + } + + // handle vocoder plan (e.g. --hf-repo-v) + if (!plan_voc.model_files.empty()) { + add_tasks(plan_voc.model_files, params.vocoder.model); + } + + // run all tasks in parallel + if (!params.offline) { + // if duplicated files are found, only download once (but still call on_done for each task) + std::unordered_map unique_tasks; + for (auto & task : tasks) { + auto it = unique_tasks.find(task.local_path); + if (it == unique_tasks.end()) { + unique_tasks[task.local_path] = &task; + } + } + std::vector unique_tasks_vec; + for (auto & pair : unique_tasks) { + unique_tasks_vec.push_back(*pair.second); + } + common_download_run_tasks(unique_tasks_vec); + } + + // download successful, update params with the downloaded paths + for (const auto & task : tasks) { + if (task.on_done) { + task.on_done(); } - common_params_handle_model(params.speculative.draft.mparams, sub_opts); - common_params_handle_model(params.vocoder.model, sub_opts); - return true; - } catch (const common_skip_download_exception &) { - return false; - } catch (const std::exception &) { - throw; } } +// +// CLI argument parsing functions +// + static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { common_params & params = ctx_arg.params; @@ -601,30 +691,6 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context // parse the first time to get -hf option (used for remote preset) parse_cli_args(); - // export_graph_ops loads only metadata - const bool skip_model_download = ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS; - - // maybe handle remote preset - if (!params.model.hf_repo.empty() && !skip_model_download) { - std::string cli_hf_repo = params.model.hf_repo; - bool has_preset = common_params_handle_remote_preset(params, ctx_arg.ex); - - // special case: if hf_repo explicitly set by preset, we need to preserve it (ignore CLI value) - // this is useful when we have one HF repo pointing to other HF repos (one model - multiple GGUFs) - std::string preset_hf_repo = params.model.hf_repo; - bool preset_has_hf_repo = preset_hf_repo != cli_hf_repo; - - if (has_preset) { - // re-parse CLI args to override preset values - parse_cli_args(); - } - - // preserve hf_repo from preset if needed - if (preset_has_hf_repo) { - params.model.hf_repo = preset_hf_repo; - } - } - postprocess_cpu_params(params.cpuparams, nullptr); postprocess_cpu_params(params.cpuparams_batch, ¶ms.cpuparams); @@ -635,15 +701,26 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - // handle model and download + const bool skip_model_download = + // server will call common_params_handle_models() later, so we skip it here + ctx_arg.ex == LLAMA_EXAMPLE_SERVER || + // download calls common_params_handle_models() itself and prints the paths + ctx_arg.ex == LLAMA_EXAMPLE_DOWNLOAD || + // export_graph_ops loads only metadata + ctx_arg.ex == LLAMA_EXAMPLE_EXPORT_GRAPH_OPS; + if (!skip_model_download) { - common_params_handle_models(params, ctx_arg.ex); - } + // handle model and download + common_models_handler handler = common_models_handler_init(params, ctx_arg.ex); + common_models_handler_apply(handler, params); - // model is required (except for server) - // TODO @ngxson : maybe show a list of available models in CLI in this case - if (params.model.path.empty() && ctx_arg.ex != LLAMA_EXAMPLE_SERVER && !skip_model_download && !params.usage && !params.completion) { - throw std::invalid_argument("error: --model is required\n"); + // model is required (except for server) + // TODO @ngxson : maybe show a list of available models in CLI in this case + if (params.model.path.empty() + && !params.usage + && !params.completion) { + throw std::invalid_argument("error: --model is required\n"); + } } if (params.escape) { @@ -707,15 +784,19 @@ static void common_params_print_usage(common_params_context & ctx_arg) { common_options.push_back(&opt); } } - printf("----- common params -----\n\n"); - print_options(common_options); - printf("\n\n----- sampling params -----\n\n"); - print_options(sampling_options); - printf("\n\n----- speculative params -----\n\n"); - print_options(spec_options); - // TODO: maybe convert enum llama_example to string - printf("\n\n----- example-specific params -----\n\n"); - print_options(specific_options); + bool first = true; + auto print_section = [&](const char * header, std::vector & options) { + if (options.empty()) { + return; + } + printf("%s----- %s -----\n\n", first ? "" : "\n\n", header); + first = false; + print_options(options); + }; + print_section("common params", common_options); + print_section("sampling params", sampling_options); + print_section("speculative params", spec_options); + print_section("example-specific params", specific_options); } static void common_params_print_completion(common_params_context & ctx_arg) { @@ -937,7 +1018,44 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map buf; + std::vector ptrs; +}; + +static utf8_argv make_utf8_argv() { + utf8_argv out; + int wargc = 0; + LPWSTR* wargv = CommandLineToArgvW(GetCommandLineW(), &wargc); + if (!wargv) return out; + + out.buf.reserve(wargc); + for (int i = 0; i < wargc; ++i) { + int n = WideCharToMultiByte(CP_UTF8, WC_ERR_INVALID_CHARS, wargv[i], -1, nullptr, 0, nullptr, nullptr); + if (n <= 0) { out.buf.emplace_back(); continue; } + auto& s = out.buf.emplace_back(); + s.resize(static_cast(n - 1)); + (void)WideCharToMultiByte(CP_UTF8, 0, wargv[i], -1, s.data(), n, nullptr, nullptr); + } + LocalFree(wargv); + + out.ptrs.reserve(out.buf.size() + 1); + for (auto& s : out.buf) out.ptrs.push_back(s.data()); + out.ptrs.push_back(nullptr); + return out; +} +#endif + bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) { +#ifdef _WIN32 + auto utf8 = make_utf8_argv(); + // repair argv only when it matches the process command line + if (static_cast(utf8.buf.size()) == argc) { + argv = utf8.ptrs.data(); + } +#endif + auto ctx_arg = common_params_parser_init(params, ex, print_usage); const common_params params_org = ctx_arg.params; // the example can modify the default params @@ -1078,7 +1196,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex * - if both {LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_*,} are set, we will prioritize the LLAMA_EXAMPLE_* matching current example */ auto add_opt = [&](common_arg arg) { - if ((arg.in_example(ex) || arg.in_example(LLAMA_EXAMPLE_COMMON)) && !arg.is_exclude(ex)) { + // download only exposes the handful of args explicitly tagged for it + const bool inherit_common = ex != LLAMA_EXAMPLE_DOWNLOAD; + if ((arg.in_example(ex) || (inherit_common && arg.in_example(LLAMA_EXAMPLE_COMMON))) && !arg.is_exclude(ex)) { ctx_arg.options.push_back(std::move(arg)); } }; @@ -1089,7 +1209,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.usage = true; } - )); + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD})); add_opt(common_arg( {"--version"}, "show version and build info", @@ -2211,7 +2331,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, bool value) { params.no_mmproj = !value; } - ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO")); + ).set_examples({LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MMPROJ_AUTO")); add_opt(common_arg( {"--mmproj-offload"}, {"--no-mmproj-offload"}, @@ -2243,6 +2363,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.image_max_tokens = value; } ).set_examples(mmproj_examples).set_env("LLAMA_ARG_IMAGE_MAX_TOKENS")); + add_opt(common_arg( + {"--mtmd-batch-max-tokens"}, "N", + string_format("maximum number of image tokens per batch when encoding images (default: %d)", params.mtmd_batch_max_tokens), + [](common_params & params, int value) { + params.mtmd_batch_max_tokens = value; + } + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MTMD_BATCH_MAX_TOKENS")); if (llama_supports_rpc()) { add_opt(common_arg( {"--rpc"}, "SERVERS", @@ -2603,14 +2730,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.model.path = value; } - ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL")); add_opt(common_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", [](common_params & params, const std::string & value) { params.model.url = value; } - ).set_env("LLAMA_ARG_MODEL_URL")); + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_MODEL_URL")); add_opt(common_arg( { "-dr", "--docker-repo" }, "[/][:quant]", "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n" @@ -2619,7 +2746,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.model.docker_repo = value; } - ).set_env("LLAMA_ARG_DOCKER_REPO")); + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_DOCKER_REPO")); add_opt(common_arg( {"-hf", "-hfr", "--hf-repo"}, "/[:quant]", "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" @@ -2629,14 +2756,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.model.hf_repo = value; } - ).set_env("LLAMA_ARG_HF_REPO")); + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_REPO")); add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file. If specified, it will override the quant in --hf-repo (default: unused)", [](common_params & params, const std::string & value) { params.model.hf_file = value; } - ).set_env("LLAMA_ARG_HF_FILE")); + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("LLAMA_ARG_HF_FILE")); add_opt(common_arg( {"-hfv", "-hfrv", "--hf-repo-v"}, "/[:quant]", "Hugging Face model repository for the vocoder model (default: unused)", @@ -2657,7 +2784,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.hf_token = value; } - ).set_env("HF_TOKEN")); + ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_DOWNLOAD}).set_env("HF_TOKEN")); + add_opt(common_arg( + {"--mtp"}, + "also download the multi-token prediction (MTP) head, if available (default: unused)", + [](common_params & params) { + params.speculative.types.push_back(COMMON_SPECULATIVE_TYPE_DRAFT_MTP); + } + ).set_examples({LLAMA_EXAMPLE_DOWNLOAD})); add_opt(common_arg( {"--context-file"}, "FNAME", "file to load context from (use comma-separated values to specify multiple files)", @@ -2867,62 +3001,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.api_prefix = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX")); - // Deprecated: use --ui-config instead (kept for backward compat) - add_opt(common_arg( - {"--webui-config"}, "JSON", - "[DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)", - [](common_params & params, const std::string & value) { - params.ui_config_json = value; - params.webui_config_json = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG")); - add_opt(common_arg( - {"--ui-config"}, "JSON", + {"--ui-config", "--webui-config"}, "JSON", "JSON that provides default UI settings (overrides UI defaults)", [](common_params & params, const std::string & value) { params.ui_config_json = value; - params.webui_config_json = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG")); - - // Deprecated: use --ui-config-file instead (kept for backward compat) - add_opt(common_arg( - {"--webui-config-file"}, "PATH", - "[DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)", - [](common_params & params, const std::string & value) { - params.ui_config_json = read_file(value); - params.webui_config_json = params.ui_config_json; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_CONFIG_FILE")); - add_opt(common_arg( - {"--ui-config-file"}, "PATH", + {"--ui-config-file", "--webui-config-file"}, "PATH", "JSON file that provides default UI settings (overrides UI defaults)", [](common_params & params, const std::string & value) { params.ui_config_json = read_file(value); - params.webui_config_json = params.ui_config_json; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_CONFIG_FILE")); - - // Deprecated: use --ui-mcp-proxy instead (kept for backward compat) add_opt(common_arg( - {"--webui-mcp-proxy"}, - {"--no-webui-mcp-proxy"}, - "[DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy", - [](common_params & params, bool value) { - params.ui_mcp_proxy = value; - params.webui_mcp_proxy = value; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI_MCP_PROXY")); - - add_opt(common_arg( - {"--ui-mcp-proxy"}, - {"--no-ui-mcp-proxy"}, + {"--ui-mcp-proxy", "--webui-mcp-proxy"}, + {"--no-ui-mcp-proxy", "--no-webui-mcp-proxy"}, "experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)", [](common_params & params, bool value) { params.ui_mcp_proxy = value; - params.webui_mcp_proxy = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI_MCP_PROXY")); add_opt(common_arg( @@ -2934,24 +3032,26 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.server_tools = parse_csv_row(value); } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_TOOLS")); - // Deprecated: use --ui/--no-ui instead (kept for backward compat) add_opt(common_arg( - {"--webui"}, - {"--no-webui"}, - "[DEPRECATED: use --ui/--no-ui] whether to enable the Web UI", + {"-ag", "--agent"}, + {"-no-ag", "--no-agent"}, + "whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)", [](common_params & params, bool value) { - params.ui = value; - params.webui = value; + if (value) { + params.server_tools = {"all"}; + params.ui_mcp_proxy = true; + } else { + params.server_tools.clear(); + params.ui_mcp_proxy = false; + } } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI")); - + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_AGENT")); add_opt(common_arg( - {"--ui"}, - {"--no-ui"}, + {"--ui", "--webui"}, + {"--no-ui", "--no-webui"}, string_format("whether to enable the Web UI (default: %s)", params.ui ? "enabled" : "disabled"), [](common_params & params, bool value) { params.ui = value; - params.webui = value; } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_UI")); add_opt(common_arg( @@ -2982,7 +3082,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_API_KEY")); add_opt(common_arg( {"--api-key-file"}, "FNAME", - "path to file containing API keys (default: none)", + "path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)", [](common_params & params, const std::string & value) { std::ifstream key_file(value); if (!key_file) { @@ -2990,7 +3090,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } std::string key; while (std::getline(key_file, key)) { - if (!key.empty()) { + if (!key.empty() && key[0] != '#') { params.api_keys.push_back(key); } } @@ -3648,6 +3748,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "draft model for speculative decoding (default: unused)", [](common_params & params, const std::string & value) { params.speculative.draft.mparams.path = value; + params.speculative.draft.mparams.hf_file = value; // will be used if --spec-draft-hf is set } ).set_spec().set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SPEC_DRAFT_MODEL")); add_opt(common_arg( diff --git a/common/arg.h b/common/arg.h index 0010f2a9ac9c..54a38b9cce4a 100644 --- a/common/arg.h +++ b/common/arg.h @@ -1,12 +1,14 @@ #pragma once #include "common.h" +#include "download.h" #include #include #include #include #include +#include // pseudo-env variable to identify preset-only arguments #define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP" @@ -129,11 +131,21 @@ bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map & args); -// populate model paths (main model, mmproj, etc) from -hf if necessary -// return true if the model is ready to use -// throw an exception if there is an error that prevents the model from being used (e.g. network error, model not found, etc) -// if params.skip_download is true, no downloads will be attempted. return false if the model is invalid or missing (e.g. ETag check failed) -bool common_params_handle_models(common_params & params, llama_example curr_ex); +struct common_models_handler { + common_download_hf_plan plan; + common_download_hf_plan plan_spec; + common_download_hf_plan plan_voc; + common_download_opts opts; +}; + +// initialize downloading opts and hf_plan if needed, but does not download anything yet +common_models_handler common_models_handler_init(const common_params & params, llama_example curr_ex); + +// check if the model is a preset repo (i.e. has a preset file) +bool common_models_handler_is_preset_repo(const common_models_handler & handler); + +// download and update params with the downloaded model path +void common_models_handler_apply(common_models_handler & handler, common_params & params, common_download_callback * callback = nullptr); // initialize argument parser context - used by test-arg-parser and preset common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); diff --git a/common/chat-auto-parser-generator.cpp b/common/chat-auto-parser-generator.cpp index db3a6cc6fe38..36aab7ecbe25 100644 --- a/common/chat-auto-parser-generator.cpp +++ b/common/chat-auto-parser-generator.cpp @@ -103,6 +103,10 @@ common_chat_params peg_generator::generate_parser(const common_chat_template & data.grammar_triggers = { { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, trigger_marker } }; + if (autoparser.tools.format.openai_wrapper_trigger) { + // model emits the OpenAI function wrapper, trigger on it + data.grammar_triggers.push_back({ COMMON_GRAMMAR_TRIGGER_TYPE_WORD, "{\"type\": \"function\"," }); + } } } @@ -134,7 +138,7 @@ common_peg_arena autoparser::build_parser(const generation_params & inputs, cons auto response_format = p.rule("response-format", p.content(p.schema(p.json(), "response-format-schema", inputs.json_schema))); parser = ctx.reasoning_parser + p.space() + p.choice({ p.literal("```json") + p.space() + response_format + p.space() + p.literal("```"), - response_format + p.space() + response_format + p.space() }) + p.end(); pure_content = false; } else if (has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && jinja_caps.supports_tool_calls) { @@ -224,13 +228,13 @@ common_peg_parser analyze_tools::build_tool_parser_json_native(parser_build_cont auto single_tool_parser = p.standard_json_tools( format.per_call_start, format.per_call_end, inputs.tools, inputs.parallel_tool_calls, inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped, - format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order); + format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger); tools_parser = p.trigger_rule("tool-calls", p.one_or_more(single_tool_parser + p.space())); } else { tools_parser = p.standard_json_tools( format.section_start, format.section_end, inputs.tools, inputs.parallel_tool_calls, inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED, name_field, args_field, format.tools_array_wrapped, - format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order); + format.fun_name_is_key, format.id_field, format.gen_id_field, format.parameter_order, format.openai_wrapper_trigger); } // Handle content wrappers if present @@ -391,11 +395,11 @@ common_peg_parser analyze_tools::build_tool_parser_tag_tagged(parser_build_conte arguments.name_suffix) + arguments.value_prefix + (schema_info.resolves_to_string(param_schema) ? - p.tool_arg_string_value(until_suffix) : - p.tool_arg_json_value(p.schema( + p.ac(p.tool_arg_string_value(until_suffix) + + p.tool_arg_close(p.literal(arguments.value_suffix)), arguments.value_suffix) : + (p.tool_arg_json_value(p.schema( p.json(), "tool-" + name + "-arg-" + param_name + "-schema", param_schema, false)) + - p.space()) + - p.tool_arg_close(p.literal(arguments.value_suffix))); + p.tool_arg_close(p.literal(arguments.value_suffix))))); auto named_arg = p.rule("tool-" + name + "-arg-" + param_name, arg); if (is_required) { diff --git a/common/chat-auto-parser.h b/common/chat-auto-parser.h index 7858f6572f2a..9e8113f24421 100644 --- a/common/chat-auto-parser.h +++ b/common/chat-auto-parser.h @@ -181,6 +181,7 @@ struct tool_format_analysis { bool fun_name_is_key = false; // In JSON format function name is JSON key, i.e. { "": { ... arguments ... } } bool tools_array_wrapped = false; // Tool calls wrapped in JSON array [...] + bool openai_wrapper_trigger = false; // model emits the OpenAI function wrapper, trigger on it std::string function_field = "function"; std::string name_field = "name"; diff --git a/common/chat-diff-analyzer.cpp b/common/chat-diff-analyzer.cpp index 0875c5347f4f..b166ee5a18f3 100644 --- a/common/chat-diff-analyzer.cpp +++ b/common/chat-diff-analyzer.cpp @@ -165,6 +165,14 @@ static std::vector void { + if (tmpl.src.find("Respond in the format {\"name\": function name") != std::string::npos && + tmpl.src.find("Do not use variables.") != std::string::npos) { + analysis.tools.format.openai_wrapper_trigger = true; + LOG_DBG(ANSI_ORANGE "[Patch: JSON name/parameters tool instruction]\n" ANSI_RESET); + } + }, }); @@ -1229,8 +1237,8 @@ void analyze_tools::extract_argument_name_markers() { left_result.tags["pre"] == right_result.tags["pre"] && left_result.tags["suffix"] == right_result.tags["suffix"]) { // Name is inside a structure (e.g., JSON key): prefix is the shared wrapper - arguments.name_prefix = trim_whitespace(left_result.tags["pre"]); - arguments.name_suffix = trim_leading_whitespace(left_result.tags["suffix"]); + arguments.name_prefix = left_result.tags["pre"]; + arguments.name_suffix = left_result.tags["suffix"]; } else if (diff.left.substr(0, ARG_FIRST.length()) == ARG_FIRST && diff.right.substr(0, ARG_SECOND.length()) == ARG_SECOND) { // Name is directly in the diff: prefix comes from last marker in diff.prefix auto pre_parser = build_tagged_peg_parser([&](common_peg_parser_builder & p) { @@ -1315,8 +1323,7 @@ void analyze_tools::extract_argument_value_markers() { value_suffix = value_suffix.substr(0, end_marker_pos); } } - value_suffix = trim_leading_whitespace(value_suffix); - if (!value_suffix.empty()) { + if (!trim_whitespace(value_suffix).empty()) { arguments.value_suffix = value_suffix; } } diff --git a/common/chat-peg-parser.cpp b/common/chat-peg-parser.cpp index 9bc5ac98be69..a309f02765b7 100644 --- a/common/chat-peg-parser.cpp +++ b/common/chat-peg-parser.cpp @@ -363,7 +363,7 @@ void common_chat_peg_mapper::map(const common_peg_ast_node & node) { } if ((is_arg_value || is_arg_string_value) && current_tool) { - std::string value_content = std::string(trim_trailing_space(trim_leading_space(node.text, 1), 1)); + std::string value_content = std::string(node.text); std::string value_to_add; if (value_content.empty() && is_arg_string_value) { @@ -540,10 +540,11 @@ common_peg_parser common_chat_peg_builder::python_style_tool_calls( auto arg_name_parser = literal(prop_name); common_peg_parser arg_value_parser = eps(); - auto string_value_parser = choice({ - literal("\"") + tool_arg_string_value(string_content('"')) + literal("\""), - literal("'") + tool_arg_string_value(string_content('\'')) + literal("'") - }); + // Quoted literal as a value: normalize_quotes_to_json preserves escapes. + auto string_value_parser = tool_arg_value(choice({ + literal("\"") + string_content('"') + literal("\""), + literal("'") + string_content('\'') + literal("'") + })); if (is_string_type) { arg_value_parser = string_value_parser; @@ -745,7 +746,8 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys( const std::string & effective_args_key, const std::string & call_id_key, const std::string & gen_call_id_key, - const std::vector & parameters_order) { + const std::vector & parameters_order, + bool accept_openai_wrapper) { auto tool_choices = choice(); auto name_key_parser = literal("\"" + effective_name_key + "\""); @@ -807,7 +809,13 @@ common_peg_parser common_chat_peg_builder::build_json_tools_flat_keys( return idx_a < idx_b; }); - auto ordered_body = tool_open(literal("{")) + space(); + // accept an optional leading "type": "function" field when the model emits the OpenAI wrapper + common_peg_parser type_field = eps(); + if (accept_openai_wrapper) { + type_field = optional(literal("\"type\"") + space() + literal(":") + space() + + literal("\"function\"") + space() + literal(",") + space()); + } + auto ordered_body = tool_open(literal("{")) + space() + type_field; for (size_t i = 0; i < parser_pairs.size(); i++) { ordered_body = ordered_body + parser_pairs[i].first; if (i < parser_pairs.size() - 1) { @@ -870,7 +878,8 @@ common_peg_parser common_chat_peg_builder::standard_json_tools( bool function_is_key, const std::string & call_id_key, const std::string & gen_call_id_key, - const std::vector & parameters_order) { + const std::vector & parameters_order, + bool accept_openai_wrapper) { if (!tools.is_array() || tools.empty()) { return eps(); } @@ -888,7 +897,7 @@ common_peg_parser common_chat_peg_builder::standard_json_tools( if (!name_spec.first.empty() || !args_spec.first.empty()) { tool_choices = build_json_tools_nested_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key); } else { - tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order); + tool_choices = build_json_tools_flat_keys(tools, effective_name_key, effective_args_key, call_id_key, gen_call_id_key, parameters_order, accept_openai_wrapper); } } diff --git a/common/chat-peg-parser.h b/common/chat-peg-parser.h index a4643fbea869..b3ffd7de2dd8 100644 --- a/common/chat-peg-parser.h +++ b/common/chat-peg-parser.h @@ -120,7 +120,8 @@ class common_chat_peg_builder : public common_peg_parser_builder { bool function_is_key = false, const std::string & call_id_key = "", const std::string & gen_call_id_key = "", - const std::vector & parameters_order = {}); + const std::vector & parameters_order = {}, + bool accept_openai_wrapper = false); // Legacy-compatible helper for building XML/tagged style tool calls // Used by tests and manual parsers @@ -157,7 +158,8 @@ class common_chat_peg_builder : public common_peg_parser_builder { const std::string & effective_args_key, const std::string & call_id_key, const std::string & gen_call_id_key, - const std::vector & parameters_order); + const std::vector & parameters_order, + bool accept_openai_wrapper); }; inline common_peg_arena build_chat_peg_parser( diff --git a/common/chat.cpp b/common/chat.cpp index 9639af9ca466..0cee80434ece 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -90,41 +90,93 @@ std::string common_chat_msg::render_content(const std::string & delimiter) const return text; } -std::vector common_chat_split_by_role(const std::string & prompt, const std::vector & delims) { - if (delims.empty() || prompt.empty()) { - return {}; +common_chat_role common_chat_role_from_string(const std::string & role) { + if (role == "system") { return COMMON_CHAT_ROLE_SYSTEM; } + if (role == "assistant") { return COMMON_CHAT_ROLE_ASSISTANT; } + if (role == "user") { return COMMON_CHAT_ROLE_USER; } + if (role == "tool") { return COMMON_CHAT_ROLE_TOOL; } + return COMMON_CHAT_ROLE_UNKNOWN; +} + +const char * common_chat_role_to_string(common_chat_role role) { + switch (role) { + case COMMON_CHAT_ROLE_SYSTEM: return "system"; + case COMMON_CHAT_ROLE_ASSISTANT: return "assistant"; + case COMMON_CHAT_ROLE_USER: return "user"; + case COMMON_CHAT_ROLE_TOOL: return "tool"; + case COMMON_CHAT_ROLE_UNKNOWN: return ""; } + return ""; +} - auto parser = build_peg_parser([&](common_peg_parser_builder & p) { - std::vector all_delims; - std::vector tagged_messages; +json common_chat_msg_delimiters::to_json() const { + json result = json::array(); + for (const auto & d : delimiters) { + result.push_back({ + { "role", common_chat_role_to_string(d.role) }, + { "delimiter", d.delimiter }, + }); + } + return result; +} - all_delims.reserve(delims.size()); - tagged_messages.reserve(delims.size()); - for (const auto & d : delims) { - all_delims.push_back(d.delimiter); - } +common_chat_msg_delimiters common_chat_msg_delimiters_parse(const json & delimiters) { + common_chat_msg_delimiters result; - auto any_delim = p.until_one_of(all_delims); - for (const auto & d : delims) { - tagged_messages.push_back(p.tag(d.role, p.literal(d.delimiter) + any_delim)); + if (!delimiters.is_array()) { + return result; + } + + result.delimiters.reserve(delimiters.size()); + for (const auto & d : delimiters) { + if (!d.is_object()) { + continue; } + result.delimiters.push_back({ + common_chat_role_from_string(d.value("role", std::string())), + d.value("delimiter", std::string()), + }); + } - return any_delim + p.zero_or_more(p.choice(tagged_messages)) + p.end(); - }); + return result; +} - common_peg_parse_context ctx(prompt); - const auto result = parser.parse(ctx); - if (!result.success()) { - return {}; +void common_chat_msg_delimiters::tokenize(const llama_vocab * vocab) { + for (auto & d : delimiters) { + d.tokens = common_tokenize(vocab, d.delimiter, false, true); } +} + +common_chat_msg_spans common_chat_msg_delimiters::split(const llama_tokens & tokens, const std::map & skips) const { + std::vector> matches; - std::vector spans; - ctx.ast.visit(result, [&](const common_peg_ast_node & node) { - if (!node.tag.empty()) { - spans.push_back({ node.tag, node.start, node.end - node.start }); + auto skip = skips.begin(); + for (size_t i = 0; i < tokens.size();) { + if (skip != skips.end() && i == skip->first) { + i += skip->second; + ++skip; + continue; } - }); + for (const auto & d : delimiters) { + if (i + d.tokens.size() > tokens.size()) { + continue; + } + if (std::equal(d.tokens.begin(), d.tokens.end(), tokens.begin() + i)) { + matches.emplace_back(d.role, i); + break; + } + } + i++; + } + + matches.emplace_back(COMMON_CHAT_ROLE_UNKNOWN, tokens.size()); + + common_chat_msg_spans spans; + for (size_t i = 0; i + 1 < matches.size(); i++) { + const auto & curr = matches[i]; + const auto & next = matches[i + 1]; + spans.add(curr.first, curr.second, next.second - curr.second); + } return spans; } @@ -1081,13 +1133,13 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp data.prompt = prompt; data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs, /* messages_override= */ adjusted_messages); - data.message_spans = common_chat_split_by_role(prompt, { - { "assistant", "<|start|>assistant" }, - { "user", "<|start|>user" }, - { "system", "<|start|>developer" }, - { "system", "<|start|>system" }, - { "tool", "<|start|>functions" }, - }); + data.message_delimiters = { + { COMMON_CHAT_ROLE_ASSISTANT, "<|start|>assistant" }, + { COMMON_CHAT_ROLE_USER, "<|start|>user" }, + { COMMON_CHAT_ROLE_SYSTEM, "<|start|>developer" }, + { COMMON_CHAT_ROLE_SYSTEM, "<|start|>system" }, + { COMMON_CHAT_ROLE_TOOL, "<|start|>functions" }, + }; data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; data.supports_thinking = true; @@ -1228,10 +1280,10 @@ static common_chat_params common_chat_params_init_gemma4(const common_chat_templ data.prompt += data.generation_prompt; } - data.message_spans = common_chat_split_by_role(data.prompt, { - { "user", "<|turn>user\n" }, - { "assistant", "<|turn>model\n" }, - }); + data.message_delimiters = { + { COMMON_CHAT_ROLE_USER, "<|turn>user" }, + { COMMON_CHAT_ROLE_ASSISTANT, "<|turn>model" }, + }; data.format = COMMON_CHAT_FORMAT_PEG_GEMMA4; data.supports_thinking = true; @@ -1979,6 +2031,146 @@ static common_chat_params common_chat_params_init_deepseek_v3_2(const common_cha return data; } +// Cohere2 MoE (a.k.a. "North Code") parser. +// +// The assistant turn is fully marker-wrapped: +// <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> +// <|START_THINKING|>{reasoning}<|END_THINKING|> +// then EITHER content: <|START_TEXT|>{content}<|END_TEXT|> +// OR tool calls: <|START_ACTION|>[ +// {"tool_call_id": "0", "tool_name": "f", "parameters": {...}}, ... +// ]<|END_ACTION|> +// <|END_OF_TURN_TOKEN|> +// +// The generation prompt forces a leading <|START_THINKING|> (when reasoning is enabled, which is +// the template default), so the model's output continues from *inside* the thinking block. The +// parser literal therefore only covers the stable <|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> prefix +// and the reasoning rule consumes the <|START_THINKING|> ... <|END_THINKING|> markers itself, +// regardless of whether they came from the generation prompt or the generated text. +static common_chat_params common_chat_params_init_cohere2moe(const common_chat_template & tmpl, + const autoparser::generation_params & inputs) { + common_chat_params data; + + const std::string TURN_START = "<|START_OF_TURN_TOKEN|>"; + const std::string TURN_END = "<|END_OF_TURN_TOKEN|>"; + const std::string CHATBOT = "<|CHATBOT_TOKEN|>"; + const std::string USER = "<|USER_TOKEN|>"; + const std::string SYSTEM = "<|SYSTEM_TOKEN|>"; + const std::string THINK_START = "<|START_THINKING|>"; + const std::string THINK_END = "<|END_THINKING|>"; + const std::string TEXT_START = "<|START_TEXT|>"; + const std::string TEXT_END = "<|END_TEXT|>"; + const std::string ACTION_START = "<|START_ACTION|>"; + const std::string ACTION_END = "<|END_ACTION|>"; + const std::string RESULT_START = "<|START_TOOL_RESULT|>"; + const std::string RESULT_END = "<|END_TOOL_RESULT|>"; + + // Stable prefix of the generation prompt that precedes the (forced) <|START_THINKING|> marker. + const std::string GEN_PREFIX = TURN_START + CHATBOT; + + data.prompt = common_chat_template_direct_apply_impl(tmpl, inputs); + data.generation_prompt = common_chat_template_generation_prompt_impl(tmpl, inputs); + data.format = COMMON_CHAT_FORMAT_PEG_NATIVE; + data.supports_thinking = true; + data.thinking_start_tag = THINK_START; + data.thinking_end_tag = THINK_END; + data.preserved_tokens = { + TURN_START, TURN_END, CHATBOT, USER, SYSTEM, + THINK_START, THINK_END, + TEXT_START, TEXT_END, + ACTION_START, ACTION_END, + RESULT_START, RESULT_END, + }; + + // Declare per-role message delimiters. Tool results are rendered with the + // system token followed by <|START_TOOL_RESULT|>, so the "tool" delimiter must be listed before + // the plain "system" one (it is a strict superset, and the role split tries delimiters in order). + data.message_delimiters = { + { COMMON_CHAT_ROLE_ASSISTANT, GEN_PREFIX }, + { COMMON_CHAT_ROLE_USER, TURN_START + USER }, + { COMMON_CHAT_ROLE_TOOL, TURN_START + SYSTEM + RESULT_START }, + { COMMON_CHAT_ROLE_SYSTEM, TURN_START + SYSTEM }, + }; + + auto has_tools = inputs.tools.is_array() && !inputs.tools.empty(); + auto extract_reasoning = inputs.reasoning_format != COMMON_REASONING_FORMAT_NONE; + auto include_grammar = has_tools && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE; + + if (inputs.has_continuation()) { + const auto & msg = inputs.continue_msg; + + data.generation_prompt = GEN_PREFIX + THINK_START + msg.reasoning_content; + if (inputs.continue_final_message == COMMON_CHAT_CONTINUATION_CONTENT) { + data.generation_prompt += THINK_END + TEXT_START + msg.render_content(); + } + + data.prompt += data.generation_prompt; + } + + auto parser = build_chat_peg_parser([&](common_chat_peg_builder & p) { + auto generation_prompt = p.literal(GEN_PREFIX); + auto end = p.end(); + + // The thinking block is always present (the generation prompt forces <|START_THINKING|>). + // When extracting reasoning, capture its body; otherwise keep the whole block (markers + // included) inline as content, matching reasoning_format=NONE conventions. + common_peg_parser reasoning = p.eps(); + if (extract_reasoning) { + reasoning = p.optional(p.literal(THINK_START) + + p.reasoning(p.until_one_of({ THINK_END, TEXT_START, ACTION_START })) + + p.optional(p.literal(THINK_END))); + } else { + reasoning = p.optional(p.content(p.literal(THINK_START) + + p.until_one_of({ THINK_END, TEXT_START, ACTION_START }) + + p.optional(p.literal(THINK_END)))); + } + + auto text_content = p.literal(TEXT_START) + p.content(p.until(TEXT_END)) + p.optional(p.literal(TEXT_END)); + + if (!has_tools || inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_NONE) { + return generation_prompt + reasoning + text_content + p.optional(p.literal(TURN_END)) + end; + } + + auto require_tools = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_REQUIRED; + + // <|START_ACTION|>[ {"tool_call_id": "0", "tool_name": "f", "parameters": {...}}, ... ]<|END_ACTION|> + auto tool_calls = p.standard_json_tools(ACTION_START, ACTION_END, inputs.tools, inputs.parallel_tool_calls, + /* force_tool_calls = */ true, + /* name_key = */ "tool_name", + /* args_key = */ "parameters", + /* array_wrapped = */ true, + /* function_is_key = */ false, + /* call_id_key = */ "", + /* gen_call_id_key = */ "tool_call_id", + /* parameters_order = */ { "tool_call_id", "tool_name", "parameters" }); + + // Content and tool calls are mutually exclusive in this format. + common_peg_parser body = require_tools ? tool_calls : p.choice({ tool_calls, text_content }); + + return generation_prompt + reasoning + body + p.optional(p.literal(TURN_END)) + end; + }); + + data.parser = parser.save(); + + if (include_grammar) { + data.grammar_lazy = inputs.tool_choice == COMMON_CHAT_TOOL_CHOICE_AUTO; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + foreach_function(inputs.tools, [&](const json & tool) { + const auto & function = tool.at("function"); + auto schema = function.at("parameters"); + builder.resolve_refs(schema); + }); + parser.build_grammar(builder, data.grammar_lazy); + }); + + data.grammar_triggers = { + { COMMON_GRAMMAR_TRIGGER_TYPE_WORD, ACTION_START } + }; + } + + return data; +} + namespace workaround { static void map_developer_role_to_system(json & messages) { @@ -2227,6 +2419,15 @@ std::optional common_chat_try_specialized_template( return common_chat_params_init_kimi_k2(tmpl, params); } + // Cohere2 MoE / North Code - marker-wrapped format with <|START_TEXT|> content and + // <|START_ACTION|> JSON tool calls. <|START_TEXT|> is unique to this template (the older + // Command-R templates use <|START_RESPONSE|>). + if (src.find("<|START_TEXT|>") != std::string::npos && + src.find("<|START_ACTION|>") != std::string::npos) { + LOG_DBG("Using specialized template: Cohere2 MoE\n"); + return common_chat_params_init_cohere2moe(tmpl, params); + } + if (is_lfm2_template(src)) { LOG_DBG("Using specialized template: LFM2\n"); return common_chat_params_init_lfm2(tmpl, params, /* tool_list_tokens = */ true); @@ -2377,17 +2578,15 @@ static common_chat_params common_chat_templates_apply_jinja(const struct common_ autoparser.analyze_template(tmpl); auto auto_params = autoparser::peg_generator::generate_parser(tmpl, params, autoparser); - std::vector delimiters; + common_chat_msg_delimiters delimiters; if (!autoparser.assistant_start.empty()) { - delimiters.push_back({ "assistant", autoparser.assistant_start }); + delimiters.add(COMMON_CHAT_ROLE_ASSISTANT, autoparser.assistant_start); } if (!autoparser.user_start.empty()) { - delimiters.push_back({ "user", autoparser.user_start }); + delimiters.add(COMMON_CHAT_ROLE_USER, autoparser.user_start); } - if (!delimiters.empty()) { - auto_params.message_spans = common_chat_split_by_role(auto_params.prompt, delimiters); - } + auto_params.message_delimiters = std::move(delimiters); auto_params.supports_thinking = autoparser.reasoning.mode != autoparser::reasoning_mode::NONE; if (auto_params.supports_thinking) { @@ -2529,8 +2728,9 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars } return msg; } - throw std::runtime_error(std::string("Failed to parse input at pos ") + std::to_string(result.end) + ": " + - effective_input.substr(result.end)); + LOG_WRN("%s: unparsed %s output: %s\n", __func__, common_chat_format_name(params.format), effective_input.substr(result.end).c_str()); + LOG_DBG("%s: full %s output triggering error:\n=== BEGIN ===\n%s\n=== END ===\n", __func__, common_chat_format_name(params.format), effective_input.c_str()); + throw std::runtime_error(std::string("The model produced output that does not match the expected ") + common_chat_format_name(params.format) + " format"); } common_chat_msg msg; @@ -2558,5 +2758,9 @@ common_chat_msg common_chat_peg_parse(const common_peg_arena & src_pars std::map common_chat_templates_get_caps(const common_chat_templates * chat_templates) { GGML_ASSERT(chat_templates != nullptr); GGML_ASSERT(chat_templates->template_default != nullptr); + if (chat_templates->template_tool_use != nullptr) { + // take the more expressive template when available + return chat_templates->template_tool_use->caps.to_map(); + } return chat_templates->template_default->caps.to_map(); } diff --git a/common/chat.h b/common/chat.h index 5659cd42a07c..7898f1623f54 100644 --- a/common/chat.h +++ b/common/chat.h @@ -143,15 +143,75 @@ struct common_chat_msg_diff { } }; +enum common_chat_role { + COMMON_CHAT_ROLE_UNKNOWN, + COMMON_CHAT_ROLE_SYSTEM, + COMMON_CHAT_ROLE_ASSISTANT, + COMMON_CHAT_ROLE_USER, + COMMON_CHAT_ROLE_TOOL +}; + +common_chat_role common_chat_role_from_string(const std::string & role); +const char * common_chat_role_to_string(common_chat_role role); + struct common_chat_msg_span { - std::string role; + common_chat_role role = COMMON_CHAT_ROLE_UNKNOWN; std::size_t pos = 0; std::size_t len = 0; + + bool valid() const { + return role != COMMON_CHAT_ROLE_UNKNOWN; + } +}; + +struct common_chat_msg_spans { + std::vector spans; + + void add(common_chat_role role, size_t pos, size_t len) { + spans.push_back({ role, pos, len }); + } + + bool is_user_start(int32_t pos) const { + for (auto it = spans.begin(); it != spans.end(); ++it) { + if (it->role == COMMON_CHAT_ROLE_USER && pos == (int32_t) it->pos) { + return true; + } + } + return false; + } + + int32_t last_user_message_pos() const { + for (auto it = spans.rbegin(); it != spans.rend(); ++it) { + if (it->role == COMMON_CHAT_ROLE_USER) { + return (int32_t) it->pos; + } + } + return -1; + } }; struct common_chat_msg_delimiter { - std::string role; - std::string delimiter; + common_chat_role role = COMMON_CHAT_ROLE_UNKNOWN; + std::string delimiter; + llama_tokens tokens = {}; +}; + +struct common_chat_msg_delimiters { + std::vector delimiters; + + common_chat_msg_delimiters() = default; + common_chat_msg_delimiters(std::initializer_list delims) : delimiters(delims) {} + + void add(common_chat_role role, const std::string & delimiter) { + delimiters.push_back({ role, delimiter }); + } + + void tokenize(const llama_vocab * vocab); + + // split tokens into message spans. skips maps a start index to a length of a region to jump over without matching + common_chat_msg_spans split(const llama_tokens & tokens, const std::map & skips = {}) const; + + nlohmann::ordered_json to_json() const; }; struct common_chat_tool { @@ -219,7 +279,7 @@ struct common_chat_params { std::vector preserved_tokens; std::vector additional_stops; std::string parser; - std::vector message_spans; + common_chat_msg_delimiters message_delimiters; }; // per-message parsing syntax @@ -325,5 +385,4 @@ struct common_chat_prompt_preset { common_chat_prompt_preset common_chat_get_asr_prompt(const common_chat_templates * chat_templates); -std::vector common_chat_split_by_role(const std::string & prompt, const std::vector & delims); - +common_chat_msg_delimiters common_chat_msg_delimiters_parse(const nlohmann::ordered_json & delimiters); diff --git a/common/common.cpp b/common/common.cpp index b01772e1cbfe..a14e7bbed99e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1074,6 +1074,18 @@ std::vector fs_list(const std::string & path, bool include_dir return files; } +std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode) { +#ifdef _WIN32 + int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0); + if (!wlen) { return std::ifstream(); } + std::vector wfname(wlen); + (void)MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wfname.data(), wlen); + return std::ifstream(wfname.data(), mode); +#else + return std::ifstream(fname, mode); +#endif +} + // // TTY utils // @@ -2034,7 +2046,7 @@ bool common_prompt_batch_decode( } size_t common_prompt_checkpoint::size() const { - return data_tgt.size() + data_dft.size(); + return data_tgt.size() + data_dft.size() + data_spec.size(); } bool common_prompt_checkpoint::empty() const { @@ -2049,6 +2061,7 @@ void common_prompt_checkpoint::clear() { data_tgt.clear(); data_dft.clear(); + data_spec.clear(); } void common_prompt_checkpoint::update_pos( @@ -2138,4 +2151,5 @@ void common_prompt_checkpoint::clear_tgt() { void common_prompt_checkpoint::clear_dft() { data_dft.clear(); + data_spec.clear(); } diff --git a/common/common.h b/common/common.h index 4864186f6287..94147d5d8cf1 100644 --- a/common/common.h +++ b/common/common.h @@ -96,6 +96,7 @@ enum llama_example { LLAMA_EXAMPLE_FIT_PARAMS, LLAMA_EXAMPLE_RESULTS, LLAMA_EXAMPLE_EXPORT_GRAPH_OPS, + LLAMA_EXAMPLE_DOWNLOAD, LLAMA_EXAMPLE_COUNT, }; @@ -290,12 +291,25 @@ struct common_params_sampling { }; struct common_params_model { - std::string path = ""; // model local path // NOLINT - std::string url = ""; // model url to download // NOLINT - std::string hf_repo = ""; // HF repo // NOLINT - std::string hf_file = ""; // HF file // NOLINT - std::string docker_repo = ""; // Docker repo // NOLINT - std::string name = ""; // in format /[:] (tag is optional) // NOLINT + std::string path = ""; // model local path + std::string url = ""; // model url to download + std::string hf_repo = ""; // HF repo + std::string hf_file = ""; // HF file + std::string docker_repo = ""; // Docker repo + + std::string get_name() const { + if (!hf_repo.empty()) { + return hf_repo; + } + if (!docker_repo.empty()) { + return docker_repo; + } + return path; + } + + bool empty() const { + return get_name().empty(); + } }; // draft-model-based speculative decoding parameters @@ -358,12 +372,12 @@ struct common_params_speculative { common_params_speculative_ngram_cache ngram_cache; bool has_dft() const { - return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty(); + return !draft.mparams.empty(); } uint32_t need_n_rs_seq() const { bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) { - return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP; + return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP || t == COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3; }); return needs_rs_seq ? draft.n_max : 0u; @@ -510,7 +524,6 @@ struct common_params { int32_t control_vector_layer_start = -1; // layer range for control vector int32_t control_vector_layer_end = -1; // layer range for control vector bool offline = false; - bool skip_download = false; // skip model file downloading int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used. int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line @@ -575,6 +588,7 @@ struct common_params { std::vector image; // path to image file(s) ; TODO: change the name to "media" int image_min_tokens = -1; int image_max_tokens = -1; + int mtmd_batch_max_tokens = 1024; // finetune struct lr_opt lr; @@ -599,7 +613,7 @@ struct common_params { bool cache_prompt = true; // whether to enable prompt caching bool cache_idle_slots = true; // save and clear idle slots upon starting a new task int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot - int32_t checkpoint_min_step = 256; // minimum spacing between context checkpoints + int32_t checkpoint_min_step = 8192; // minimum spacing between context checkpoints int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc. std::string hostname = "127.0.0.1"; @@ -623,12 +637,6 @@ struct common_params { // UI configs bool ui = true; - - // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead - bool webui = ui; - bool webui_mcp_proxy = false; - std::string webui_config_json; - bool ui_mcp_proxy = false; std::string ui_config_json; @@ -641,10 +649,11 @@ struct common_params { std::vector server_tools; // router server configs - std::string models_dir = ""; // directory containing models for the router server - std::string models_preset = ""; // directory containing model presets for the router server - int models_max = 4; // maximum number of models to load simultaneously - bool models_autoload = true; // automatically load models when requested via the router server + std::string models_dir = ""; // directory containing models for the router server + std::string models_preset = ""; // directory containing model presets for the router server + int models_max = 4; // maximum number of models to load simultaneously + bool models_autoload = true; // automatically load models when requested via the router server + std::string models_preset_hf = ""; // show a warning about remote presets on router loaded (if not empty) bool log_json = false; @@ -846,6 +855,9 @@ struct common_file_info { }; std::vector fs_list(const std::string & path, bool include_directories); +// fs open, also handle UTF8 on Windows +std::ifstream fs_open_ifstream(const std::string & fname, std::ios_base::openmode mode); + // // TTY utils // @@ -1063,6 +1075,10 @@ struct common_prompt_checkpoint { std::vector data_tgt; std::vector data_dft; + // (optional) speculative-decoding implementation state stashed with the checkpoint + // (e.g. eagle3's deferred-boundary g_embd row) + std::vector data_spec; + size_t size() const; bool empty() const; diff --git a/common/download.cpp b/common/download.cpp index 40f6eb780f41..6b69a4418856 100644 --- a/common/download.cpp +++ b/common/download.cpp @@ -292,10 +292,6 @@ static int common_download_file_single_online(const std::string & url, const bool file_exists = std::filesystem::exists(path); - if (!file_exists && opts.skip_download) { - return -2; // file is missing and download is disabled - } - if (file_exists && skip_etag) { LOG_DBG("%s: using cached file: %s\n", __func__, path.c_str()); return 304; // 304 Not Modified - fake cached response @@ -362,9 +358,6 @@ static int common_download_file_single_online(const std::string & url, return 304; // 304 Not Modified - fake cached response } // pass this point, the file exists but is different from the server version, so we need to redownload it - if (opts.skip_download) { - return -2; // special code to indicate that the download was skipped due to etag mismatch - } if (remove(path.c_str()) != 0) { LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); return -1; @@ -691,18 +684,8 @@ static void list_available_gguf_files(const hf_cache::hf_files & files) { } } -struct hf_plan { - hf_cache::hf_file primary; - hf_cache::hf_files model_files; - hf_cache::hf_file mmproj; - hf_cache::hf_file mtp; -}; - -static hf_plan get_hf_plan(const common_params_model & model, - const common_download_opts & opts, - bool download_mmproj, - bool download_mtp) { - hf_plan plan; +common_download_hf_plan common_download_get_hf_plan(const common_params_model & model, const common_download_opts & opts) { + common_download_hf_plan plan; hf_cache::hf_files all; auto [repo, tag] = common_download_split_repo_tag(model.hf_repo); @@ -717,6 +700,14 @@ static hf_plan get_hf_plan(const common_params_model & model, return plan; } + // if preset.ini exists in the repo root, download only that file + for (const auto & f : all) { + if (f.path == "preset.ini") { + plan.preset = f; + return plan; + } + } + hf_cache::hf_file primary; if (!model.hf_file.empty()) { @@ -743,115 +734,49 @@ static hf_plan get_hf_plan(const common_params_model & model, plan.primary = primary; plan.model_files = get_split_files(all, primary); - if (download_mmproj) { + if (opts.download_mmproj) { plan.mmproj = find_best_mmproj(all, primary.path); } - - if (download_mtp) { + if (opts.download_mtp) { plan.mtp = find_best_mtp(all, primary.path); } return plan; } -struct download_task { - std::string url; - std::string path; -}; - -static std::vector get_url_tasks(const common_params_model & model) { - auto split = get_gguf_split_info(model.url); - - if (split.count <= 1) { - return {{model.url, model.path}}; - } - - auto filename = split.prefix; - if (auto pos = split.prefix.rfind('/'); pos != std::string::npos) { - filename = split.prefix.substr(pos + 1); - } - - auto parent_path = std::filesystem::path(model.path).parent_path(); - auto prefix_path = (parent_path / filename).string(); - - std::vector tasks; - for (int i = 1; i <= split.count; i++) { - auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count); - tasks.push_back({split.prefix + suffix, prefix_path + suffix}); - } - return tasks; -} - -common_download_model_result common_download_model(const common_params_model & model, - const common_download_opts & opts) { - common_download_model_result result; - std::vector tasks; - hf_plan hf; - - bool download_mmproj = opts.download_mmproj; - bool download_mtp = opts.download_mtp; - bool is_hf = !model.hf_repo.empty(); - - if (is_hf) { - hf = get_hf_plan(model, opts, download_mmproj, download_mtp); - for (const auto & f : hf.model_files) { - tasks.push_back({f.url, f.local_path}); - } - if (!hf.mmproj.path.empty()) { - tasks.push_back({hf.mmproj.url, hf.mmproj.local_path}); - } - if (!hf.mtp.path.empty()) { - tasks.push_back({hf.mtp.url, hf.mtp.local_path}); - } - } else if (!model.url.empty()) { - tasks = get_url_tasks(model); - } else { - result.model_path = model.path; - return result; - } - - if (tasks.empty()) { - return result; - } - +void common_download_run_tasks(const std::vector & tasks) { std::vector> futures; for (const auto & task : tasks) { futures.push_back(std::async(std::launch::async, - [&task, &opts, is_hf]() { - return common_download_file_single(task.url, task.path, opts, is_hf); + [&task]() { + return common_download_file_single(task.url, task.local_path, task.opts, task.is_hf); } )); } - for (auto & f : futures) { - int status = f.get(); - if (status == -2 && opts.skip_download) { - throw common_skip_download_exception(); - } + for (size_t i = 0; i < futures.size(); ++i) { + std::string url = tasks[i].url; + int status = futures[i].get(); bool is_ok = is_http_status_ok(status); if (!is_ok) { - return {}; + throw std::runtime_error(string_format("Download '%s' failed with status code: %d", url.c_str(), status)); } } +} - if (is_hf) { - for (const auto & f : hf.model_files) { - hf_cache::finalize_file(f); - } - result.model_path = hf.primary.final_path; +std::vector common_download_get_all_parts(const std::string & url) { + auto split = get_gguf_split_info(url); - if (!hf.mmproj.path.empty()) { - result.mmproj_path = hf_cache::finalize_file(hf.mmproj); - } - - if (!hf.mtp.path.empty()) { - result.mtp_path = hf_cache::finalize_file(hf.mtp); - } - } else { - result.model_path = model.path; + if (split.count <= 1) { + return {url}; } - return result; + std::vector parts; + for (int i = 1; i <= split.count; i++) { + auto suffix = string_format("-%05d-of-%05d.gguf", i, split.count); + parts.push_back(split.prefix + suffix); + } + return parts; } // @@ -997,3 +922,87 @@ std::vector common_list_cached_models() { return result; } + +bool common_download_remove(const std::string & hf_repo_with_tag) { + namespace fs = std::filesystem; + + auto [repo_id, tag] = common_download_split_repo_tag(hf_repo_with_tag); + + if (tag.empty()) { + return hf_cache::remove_cached_repo(repo_id); + } + + std::string tag_upper = tag; + for (char & c : tag_upper) { + c = (char) std::toupper((unsigned char) c); + } + + auto files = hf_cache::get_cached_files(repo_id); + if (files.empty()) { + return false; + } + + // collect snapshot entries whose tag matches + std::vector to_remove; + for (const auto & f : files) { + auto split = get_gguf_split_info(f.path); + if (split.tag == tag_upper) { + to_remove.emplace_back(f.local_path); + } + } + + if (to_remove.empty()) { + return false; + } + + // resolve blob paths from symlinks before deleting snapshot entries + std::vector blobs_to_check; + for (const auto & p : to_remove) { + std::error_code ec; + if (fs::is_symlink(p, ec)) { + auto target = fs::read_symlink(p, ec); + if (!ec) { + blobs_to_check.push_back((p.parent_path() / target).lexically_normal()); + } + } + } + + // remove snapshot entries + for (const auto & p : to_remove) { + std::error_code ec; + fs::remove(p, ec); + if (ec) { + LOG_WRN("%s: failed to remove %s: %s\n", __func__, p.string().c_str(), ec.message().c_str()); + } + } + + if (blobs_to_check.empty()) { + return true; + } + + // collect blobs still referenced by remaining snapshot entries + std::unordered_set still_referenced; + for (const auto & f : hf_cache::get_cached_files(repo_id)) { + fs::path p(f.local_path); + std::error_code ec; + if (fs::is_symlink(p, ec)) { + auto target = fs::read_symlink(p, ec); + if (!ec) { + still_referenced.insert((p.parent_path() / target).lexically_normal().string()); + } + } + } + + // remove orphaned blobs + for (const auto & blob : blobs_to_check) { + if (still_referenced.find(blob.string()) == still_referenced.end()) { + std::error_code ec; + fs::remove(blob, ec); + if (ec) { + LOG_WRN("%s: failed to remove blob %s: %s\n", __func__, blob.string().c_str(), ec.message().c_str()); + } + } + } + + return true; +} diff --git a/common/download.h b/common/download.h index ebeedd6058c7..816e1c7f58a1 100644 --- a/common/download.h +++ b/common/download.h @@ -1,7 +1,10 @@ #pragma once +#include "hf-cache.h" + #include #include +#include struct common_params_model; @@ -47,65 +50,40 @@ struct common_cached_model_info { } }; -// Options for common_download_model and common_download_file_single +// Options for common_download_file_single struct common_download_opts { std::string bearer_token; common_header_list headers; bool offline = false; - bool skip_download = false; // if true, only validation is performed, common_skip_download_exception may be thrown if the file is missing or invalid bool download_mmproj = false; bool download_mtp = false; common_download_callback * callback = nullptr; }; -// Result of common_download_model -struct common_download_model_result { - std::string model_path; - std::string mmproj_path; - std::string mtp_path; +struct common_download_task { + common_download_opts opts; + std::string url; + std::string local_path; + std::function on_done; + bool is_hf = false; + + common_download_task() = default; + common_download_task(hf_cache::hf_file f, + const common_download_opts & opts, + std::function on_done = nullptr) + : opts(opts), url(f.url), local_path(f.local_path), on_done(on_done), is_hf(true) {} }; -// throw if the file is missing or invalid (e.g. ETag check failed) -struct common_skip_download_exception : public std::runtime_error { - common_skip_download_exception() : std::runtime_error("skip download") {} -}; +void common_download_run_tasks(const std::vector & tasks); -// Download model from HuggingFace repo or URL -// -// input (via model struct): -// - model.hf_repo: HF repo with optional tag, see common_download_split_repo_tag -// - model.hf_file: specific file in the repo (requires hf_repo) -// - model.url: simple download (used if hf_repo is empty) -// - model.path: local file path -// -// tag matching (for HF repos without model.hf_file): -// - if tag is specified, searches for GGUF matching that quantization -// - if no tag, searches for Q4_K_M, then Q4_0, then first available GGUF -// -// split GGUF: multi-part files like "model-00001-of-00003.gguf" are automatically -// detected and all parts are downloaded -// -// caching: -// - HF repos: uses HuggingFace cache -// - URLs: uses ETag-based caching -// -// when opts.offline=true, no network requests are made -// when download_mmproj=true, searches for mmproj in same directory as model or any parent directory -// then with the closest quantization bits -// when download_mtp=true, applies the same sibling search for an MTP-head GGUF -// -// returns result with model_path, mmproj_path and mtp_path (empty when not found / on failure) -common_download_model_result common_download_model( - const common_params_model & model, - const common_download_opts & opts = {} -); +// if url is a multi-part GGUF file, returns all parts, otherwise returns the single file +std::vector common_download_get_all_parts(const std::string & url); // returns list of cached models std::vector common_list_cached_models(); // download single file from url to local path // returns status code or -1 on error -// returns -2 if the download was skipped due to ETag mismatch (file outdated, skip_download=true) // skip_etag: if true, don't read/write .etag files (for HF cache where filename is the hash) int common_download_file_single(const std::string & url, const std::string & path, @@ -115,3 +93,19 @@ int common_download_file_single(const std::string & url, // resolve and download model from Docker registry // return local path to downloaded model file std::string common_docker_resolve_model(const std::string & docker); + +// Remove a cached model from disk +// input format: "user/model" or "user/model:tag" +// - if tag is omitted, removes the entire repo cache directory +// - if tag is present, removes only files matching that tag (and orphaned blobs) +// returns true if anything was removed +bool common_download_remove(const std::string & hf_repo_with_tag); + +struct common_download_hf_plan { + hf_cache::hf_file primary; + hf_cache::hf_files model_files; + hf_cache::hf_file mmproj; + hf_cache::hf_file mtp; + hf_cache::hf_file preset; // if set, only this file is downloaded +}; +common_download_hf_plan common_download_get_hf_plan(const common_params_model & model, const common_download_opts & opts); diff --git a/common/fit.cpp b/common/fit.cpp index 668d892e908c..a8565bfc9123 100644 --- a/common/fit.cpp +++ b/common/fit.cpp @@ -26,7 +26,7 @@ class common_params_fit_exception : public std::runtime_error { using std::runtime_error::runtime_error; }; -std::vector common_get_device_memory_data( +static std::vector common_get_device_memory_data_impl( const char * path_model, const llama_model_params * mparams, const llama_context_params * cparams, @@ -150,6 +150,29 @@ std::vector common_get_device_memory_data( return ret; } +common_device_memory_data_vec common_get_device_memory_data( + const char * path_model, + const llama_model_params * mparams, + const llama_context_params * cparams, + std::vector & devs, + uint32_t & hp_ngl, + uint32_t & hp_n_ctx_train, + uint32_t & hp_n_expert, + ggml_log_level log_level) { + std::vector impl = common_get_device_memory_data_impl( + path_model, mparams, cparams, devs, hp_ngl, hp_n_ctx_train, hp_n_expert, log_level); + + common_device_memory_data_vec ret(impl.size()); + for (size_t i = 0; i < impl.size(); i++) { + ret[i].total = impl[i].total; + ret[i].free = impl[i].free; + ret[i].model = impl[i].mb.model; + ret[i].context = impl[i].mb.context; + ret[i].compute = impl[i].mb.compute; + } + return ret; +} + static void common_params_fit_impl( const char * path_model, struct llama_model_params * mparams, struct llama_context_params * cparams, float * tensor_split, struct llama_model_tensor_buft_override * tensor_buft_overrides, @@ -169,7 +192,7 @@ static void common_params_fit_impl( // step 1: get data for default parameters and check whether any changes are necessary in the first place LOG_TRC("%s: getting device memory data for initial parameters:\n", __func__); - const dmds_t dmds_full = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); + const dmds_t dmds_full = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); const size_t nd = devs.size(); // number of devices std::vector margins; // this function uses int64_t rather than size_t for memory sizes to more conveniently handle deficits @@ -304,7 +327,7 @@ static void common_params_fit_impl( int64_t sum_projected_used_min_ctx = 0; cparams->n_ctx = n_ctx_min; - const dmds_t dmds_min_ctx = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); + const dmds_t dmds_min_ctx = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); if (nd == 0) { sum_projected_used_min_ctx = dmds_min_ctx.back().mb.total(); } else { @@ -482,7 +505,7 @@ static void common_params_fit_impl( llama_model_params mparams_copy = *mparams; set_ngl_tensor_split_tbo(ngl_per_device, overflow_bufts, mparams_copy); - const dmds_t dmd_nl = common_get_device_memory_data( + const dmds_t dmd_nl = common_get_device_memory_data_impl( path_model, &mparams_copy, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); LOG_TRC("%s: memory for test allocation by device:\n", func_name); @@ -510,7 +533,7 @@ static void common_params_fit_impl( mparams->tensor_buft_overrides = tensor_buft_overrides; LOG_TRC("%s: getting device memory data with all MoE tensors moved to system memory:\n", __func__); - const dmds_t dmds_cpu_moe = common_get_device_memory_data( + const dmds_t dmds_cpu_moe = common_get_device_memory_data_impl( path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, log_level); for (size_t id = 0; id < nd; id++) { @@ -940,7 +963,7 @@ void common_fit_print( uint32_t hp_nct = 0; // hparams.n_ctx_train uint32_t hp_nex = 0; // hparams.n_expert - auto dmd = common_get_device_memory_data(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR); + auto dmd = common_get_device_memory_data_impl(path_model, mparams, cparams, devs, hp_ngl, hp_nct, hp_nex, GGML_LOG_LEVEL_ERROR); GGML_ASSERT(dmd.size() == devs.size() + 1); for (size_t id = 0; id < devs.size(); id++) { diff --git a/common/fit.h b/common/fit.h index 643d34200954..208fc30694e0 100644 --- a/common/fit.h +++ b/common/fit.h @@ -1,9 +1,7 @@ #pragma once #include "ggml.h" -#include "ggml-backend.h" #include "llama.h" -#include "../src/llama-ext.h" #include @@ -18,31 +16,41 @@ enum common_params_fit_status { // - this function is NOT thread safe because it modifies the global llama logger state // - only parameters that have the same value as in llama_default_model_params are modified // with the exception of the context size which is modified if and only if equal to 0 -enum common_params_fit_status common_fit_params( - const char * path_model, - struct llama_model_params * mparams, - struct llama_context_params * cparams, - float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements - struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements - size_t * margins, // margins of memory to leave per device in bytes - uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use - enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log +common_params_fit_status common_fit_params( + const char * path_model, + llama_model_params * mparams, + llama_context_params * cparams, + float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements + llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements + size_t * margins, // margins of memory to leave per device in bytes + uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use + ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log // print estimated memory to stdout void common_fit_print( - const char * path_model, - struct llama_model_params * mparams, - struct llama_context_params * cparams); + const char * path_model, + llama_model_params * mparams, + llama_context_params * cparams); -void common_memory_breakdown_print(const struct llama_context * ctx); +void common_memory_breakdown_print(const llama_context * ctx); + +struct common_device_memory_data { + int64_t total; + int64_t free; + size_t model; + size_t context; + size_t compute; +}; + +using common_device_memory_data_vec = std::vector; // Load a model + context with no_alloc and return the per-device memory breakdown. -std::vector common_get_device_memory_data( - const char * path_model, - const struct llama_model_params * mparams, - const struct llama_context_params * cparams, - std::vector & devs, - uint32_t & hp_ngl, - uint32_t & hp_n_ctx_train, - uint32_t & hp_n_expert, - enum ggml_log_level log_level); +common_device_memory_data_vec common_get_device_memory_data( + const char * path_model, + const llama_model_params * mparams, + const llama_context_params * cparams, + std::vector & devs, + uint32_t & hp_ngl, + uint32_t & hp_n_ctx_train, + uint32_t & hp_n_expert, + ggml_log_level log_level); diff --git a/common/hf-cache.cpp b/common/hf-cache.cpp index ba7417a12bb6..f1dacaa4778d 100644 --- a/common/hf-cache.cpp +++ b/common/hf-cache.cpp @@ -495,4 +495,19 @@ std::string finalize_file(const hf_file & file) { return file.final_path; } +bool remove_cached_repo(const std::string & repo_id) { + if (!is_valid_repo_id(repo_id)) { + LOG_WRN("%s: invalid repository: %s\n", __func__, repo_id.c_str()); + return false; + } + fs::path repo_path = get_repo_path(repo_id); + std::error_code ec; + auto removed = fs::remove_all(repo_path, ec); + if (ec) { + LOG_ERR("%s: failed to remove repo cache %s: %s\n", __func__, repo_path.string().c_str(), ec.message().c_str()); + return false; + } + return removed > 0; +} + } // namespace hf_cache diff --git a/common/hf-cache.h b/common/hf-cache.h index 23fa0adb729d..42c9c6ce34f0 100644 --- a/common/hf-cache.h +++ b/common/hf-cache.h @@ -29,4 +29,7 @@ hf_files get_cached_files(const std::string & repo_id = {}); // Create snapshot path (link or move/copy) and return it std::string finalize_file(const hf_file & file); +// Remove the entire cached directory for a repo, returns true if removed +bool remove_cached_repo(const std::string & repo_id); + } // namespace hf_cache diff --git a/common/jinja/runtime.cpp b/common/jinja/runtime.cpp index f81d98d954a1..f98cb0876f19 100644 --- a/common/jinja/runtime.cpp +++ b/common/jinja/runtime.cpp @@ -316,12 +316,22 @@ value filter_expression::execute_impl(context & ctx) { JJ_DEBUG("Applying filter to %s", input->type().c_str()); + auto set_filter_alias = [](auto & filter_id) { + if (filter_id == "count") { + filter_id = "length"; + } else if (filter_id == "d") { + filter_id = "default"; + } else if (filter_id == "e") { + filter_id = "escape"; + } else if (filter_id == "trim") { + filter_id = "strip"; + } + }; + if (is_stmt(filter)) { auto filter_id = cast_stmt(filter)->val; - if (filter_id == "trim") { - filter_id = "strip"; // alias - } + set_filter_alias(filter_id); JJ_DEBUG("Applying filter '%s' to %s", filter_id.c_str(), input->type().c_str()); // TODO: Refactor filters so this coercion can be done automatically if (!input->is_undefined() && !is_val(input) && ( @@ -345,9 +355,7 @@ value filter_expression::execute_impl(context & ctx) { } auto filter_id = cast_stmt(call->callee)->val; - if (filter_id == "trim") { - filter_id = "strip"; // alias - } + set_filter_alias(filter_id); JJ_DEBUG("Applying filter '%s' with arguments to %s", filter_id.c_str(), input->type().c_str()); func_args args(ctx); for (const auto & arg_expr : call->args) { @@ -678,59 +686,62 @@ value set_statement::execute_impl(context & ctx) { return mk_val(); } +static inline void bind_parameters(const std::string & name, const statements & this_args, const func_args & args, context & ctx) { + const size_t expected_count = this_args.size(); + const size_t input_count = args.count(); + + JJ_DEBUG("Invoking '%s' with %zu input arguments (expected %zu)", name.c_str(), input_count, expected_count); + for (size_t i = 0; i < expected_count; ++i) { + if (i < input_count) { + if (is_stmt(this_args[i])) { + // normal parameter + std::string param_name = cast_stmt(this_args[i])->val; + value param_value = args.get_kwarg_or_pos(param_name, i); + JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str()); + ctx.set_val(param_name, param_value); + } else if (is_stmt(this_args[i])) { + // default argument used as normal parameter + auto kwarg = cast_stmt(this_args[i]); + if (!is_stmt(kwarg->key)) { + throw std::runtime_error("Keyword argument key must be an identifier in '" + name + "'"); + } + std::string param_name = cast_stmt(kwarg->key)->val; + value param_value = args.get_kwarg_or_pos(param_name, i); + JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str()); + ctx.set_val(param_name, param_value); + } else { + throw std::runtime_error("Invalid parameter type in '" + name + "'"); + } + } else { + auto & default_arg = this_args[i]; + if (is_stmt(default_arg)) { + auto kwarg = cast_stmt(default_arg); + if (!is_stmt(kwarg->key)) { + throw std::runtime_error("Keyword argument key must be an identifier in '" + name + "'"); + } + std::string param_name = cast_stmt(kwarg->key)->val; + JJ_DEBUG(" Binding parameter '%s' to default argument of type %s", param_name.c_str(), kwarg->val->type().c_str()); + ctx.set_val(param_name, kwarg->val->execute(args.ctx)); + } else { + throw std::runtime_error("Not enough arguments provided to '" + name + "'"); + } + //std::string param_name = cast_stmt(default_args[i])->val; + //JJ_DEBUG(" Binding parameter '%s' to default", param_name.c_str()); + //ctx.var[param_name] = default_args[i]->execute(ctx); + } + } +} + value macro_statement::execute_impl(context & ctx) { if (!is_stmt(this->name)) { throw std::runtime_error("Macro name must be an identifier"); } std::string name = cast_stmt(this->name)->val; - const func_handler func = [this, name, &ctx](const func_args & args) -> value { - size_t expected_count = this->args.size(); - size_t input_count = args.count(); - - JJ_DEBUG("Invoking macro '%s' with %zu input arguments (expected %zu)", name.c_str(), input_count, expected_count); - context macro_ctx(ctx); // new scope for macro execution - - // bind parameters - for (size_t i = 0; i < expected_count; ++i) { - if (i < input_count) { - if (is_stmt(this->args[i])) { - // normal parameter - std::string param_name = cast_stmt(this->args[i])->val; - value param_value = args.get_kwarg_or_pos(param_name, i); - JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str()); - macro_ctx.set_val(param_name, param_value); - } else if (is_stmt(this->args[i])) { - // default argument used as normal parameter - auto kwarg = cast_stmt(this->args[i]); - if (!is_stmt(kwarg->key)) { - throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'"); - } - std::string param_name = cast_stmt(kwarg->key)->val; - value param_value = args.get_kwarg_or_pos(param_name, i); - JJ_DEBUG(" Binding parameter '%s' to argument of type %s", param_name.c_str(), param_value->type().c_str()); - macro_ctx.set_val(param_name, param_value); - } else { - throw std::runtime_error("Invalid parameter type in macro '" + name + "'"); - } - } else { - auto & default_arg = this->args[i]; - if (is_stmt(default_arg)) { - auto kwarg = cast_stmt(default_arg); - if (!is_stmt(kwarg->key)) { - throw std::runtime_error("Keyword argument key must be an identifier in macro '" + name + "'"); - } - std::string param_name = cast_stmt(kwarg->key)->val; - JJ_DEBUG(" Binding parameter '%s' to default argument of type %s", param_name.c_str(), kwarg->val->type().c_str()); - macro_ctx.set_val(param_name, kwarg->val->execute(ctx)); - } else { - throw std::runtime_error("Not enough arguments provided to macro '" + name + "'"); - } - //std::string param_name = cast_stmt(default_args[i])->val; - //JJ_DEBUG(" Binding parameter '%s' to default", param_name.c_str()); - //macro_ctx.var[param_name] = default_args[i]->execute(ctx); - } - } + const func_handler func = [this, name](const func_args & args) -> value { + context macro_ctx(args.ctx); // new scope for macro execution + + bind_parameters(name, this->args, args, macro_ctx); // execute macro body JJ_DEBUG("Executing macro '%s' body with %zu statements", name.c_str(), this->body.size()); @@ -744,6 +755,46 @@ value macro_statement::execute_impl(context & ctx) { return mk_val(); } +value call_statement::execute_impl(context & ctx) { + auto call_expr = cast_stmt(this->call); + if (!call_expr) { + throw std::runtime_error("Call statement requires a valid call expression"); + } + + value callee_val = call_expr->callee->execute(ctx); + if (!is_val(callee_val)) { + throw std::runtime_error("Callee is not a function: got " + callee_val->type()); + } + auto * callee_func = cast_val(callee_val); + + context caller_ctx(ctx); // new scope for caller execution + + const func_handler func = [this, caller_ctx = std::move(caller_ctx)](const func_args & args) -> value { + context block_ctx(caller_ctx); // new scope for block execution + + bind_parameters("caller", this->caller_args, args, block_ctx); + + JJ_DEBUG("Executing call body with %zu statements", this->body.size()); + auto res = exec_statements(this->body, block_ctx); + JJ_DEBUG("Call body execution complete, result: %s", res->val_str.str().c_str()); + return res; + }; + + context call_ctx(ctx); + call_ctx.set_val("caller", mk_val("caller", func)); + + func_args args(call_ctx); + + for (const auto & arg_expr : call_expr->args) { + auto arg_val = arg_expr->execute(ctx); + JJ_DEBUG(" Argument type: %s", arg_val->type().c_str()); + args.push_back(arg_val); + } + + JJ_DEBUG("Calling macro '%s' with %zu arguments", callee_func->name.c_str(), args.count()); + return callee_func->invoke(args); +} + value member_expression::execute_impl(context & ctx) { value object = this->object->execute(ctx); @@ -761,9 +812,9 @@ value member_expression::execute_impl(context & ctx) { if (is_stmt(this->property)) { auto s = cast_stmt(this->property); - value start_val = s->start_expr ? s->start_expr->execute(ctx) : mk_val(0); - value stop_val = s->stop_expr ? s->stop_expr->execute(ctx) : mk_val(arr_size); value step_val = s->step_expr ? s->step_expr->execute(ctx) : mk_val(1); + value start_val = s->start_expr ? s->start_expr->execute(ctx) : (step_val->as_int() < 0 ? mk_val(arr_size - 1) : mk_val(0)); + value stop_val = s->stop_expr ? s->stop_expr->execute(ctx) : (step_val->as_int() < 0 ? mk_val(-1) : mk_val(arr_size)); // translate to function call: obj.slice(start, stop, step) JJ_DEBUG("Member expression is a slice: start %s, stop %s, step %s", diff --git a/common/jinja/runtime.h b/common/jinja/runtime.h index b6f4a6ab48e4..37b4c35cac8f 100644 --- a/common/jinja/runtime.h +++ b/common/jinja/runtime.h @@ -552,6 +552,7 @@ struct call_statement : public statement { for (const auto & arg : this->caller_args) chk_type(arg); } std::string type() const override { return "CallStatement"; } + value execute_impl(context & ctx) override; }; struct ternary_expression : public expression { diff --git a/common/jinja/value.cpp b/common/jinja/value.cpp index 0b79098cd1e7..cd6a36956cea 100644 --- a/common/jinja/value.cpp +++ b/common/jinja/value.cpp @@ -90,14 +90,14 @@ static T slice(const T & array, int64_t start, int64_t stop, int64_t step = 1) { stop_val = std::min(stop_val, len); } } else { - start_val = len - 1; + start_val = start; if (start_val < 0) { - start_val = std::max(len + start_val, (int64_t)-1); + start_val = std::max(len + start_val, (int64_t)0); } else { start_val = std::min(start_val, len - 1); } - stop_val = -1; + stop_val = stop; if (stop_val < -1) { stop_val = std::max(len + stop_val, (int64_t)-1); } else { @@ -673,6 +673,9 @@ const func_builtins & value_string_t::get_builtins() const { std::string str = val_input->as_string().str(); // FIXME: Support non-specified delimiter (split on consecutive (no leading or trailing) whitespace) std::string delim = (args.count() > 1) ? args.get_pos(1)->as_string().str() : " "; + if (delim.empty()) { + throw raised_exception("empty separator"); + } int64_t maxsplit = (args.count() > 2) ? args.get_pos(2)->as_int() : -1; auto result = mk_val(); size_t pos = 0; @@ -697,6 +700,9 @@ const func_builtins & value_string_t::get_builtins() const { std::string str = val_input->as_string().str(); // FIXME: Support non-specified delimiter (split on consecutive (no leading or trailing) whitespace) std::string delim = (args.count() > 1) ? args.get_pos(1)->as_string().str() : " "; + if (delim.empty()) { + throw raised_exception("empty separator"); + } int64_t maxsplit = (args.count() > 2) ? args.get_pos(2)->as_int() : -1; auto result = mk_val(); size_t pos = 0; @@ -722,10 +728,23 @@ const func_builtins & value_string_t::get_builtins() const { if (count > 0) { throw not_implemented_exception("String replace with count argument not implemented"); } - size_t pos = 0; - while ((pos = str.find(old_str, pos)) != std::string::npos) { - str.replace(pos, old_str.length(), new_str); - pos += new_str.length(); + if (old_str != new_str) { + size_t pos = 0; + if (old_str.empty()) { + std::string new_res; + new_res.reserve(str.length() + new_str.length() * (str.length() + 1)); + new_res += new_str; + for (const char c : str) { + new_res.push_back(c); + new_res += new_str; + } + str = new_res; + } else { + while ((pos = str.find(old_str, pos)) != std::string::npos) { + str.replace(pos, old_str.length(), new_str); + pos += new_str.length(); + } + } } auto res = mk_val(str); res->val_str.mark_input_based_on(args.get_pos(0)->val_str); diff --git a/common/json-partial.cpp b/common/json-partial.cpp deleted file mode 100644 index aaf11310ab8a..000000000000 --- a/common/json-partial.cpp +++ /dev/null @@ -1,324 +0,0 @@ -#include "json-partial.h" - -#include "log.h" - -#include - -#include -#include - -using json = nlohmann::ordered_json; - -enum common_json_stack_element_type { - COMMON_JSON_STACK_ELEMENT_OBJECT, - COMMON_JSON_STACK_ELEMENT_KEY, - COMMON_JSON_STACK_ELEMENT_ARRAY, -}; - -struct common_json_stack_element { - common_json_stack_element_type type; - std::string key; -}; - -bool common_json_parse( - const std::string & input, - const std::string & healing_marker, - common_json & out) -{ - std::string::const_iterator it = input.begin(); - const auto end = input.end(); - return common_json_parse(it, end, healing_marker, out); -} - -bool common_json_parse( - std::string::const_iterator & it, - const std::string::const_iterator & end, - const std::string & healing_marker, - common_json & out) -{ - // // https://json.nlohmann.me/features/parsing/sax_interface/ - struct json_error_locator : public nlohmann::json_sax { - std::size_t position; - bool found_error; - std::string last_token; - std::string exception_message; - std::vector stack; - - json_error_locator() : position(0), found_error(false) {} - - bool parse_error(std::size_t position, const std::string & last_token, const json::exception & ex) override { // NOLINT - this->position = position - 1; - this->found_error = true; - this->last_token = last_token; - this->exception_message = ex.what(); - return false; - } - void close_value() { - if (!stack.empty() && (stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY)) { - stack.pop_back(); - } - } - bool null() override { // NOLINT - close_value(); - return true; - } - bool boolean(bool) override { // NOLINT - close_value(); - return true; - } - bool number_integer(number_integer_t) override { // NOLINT - close_value(); - return true; - } - bool number_unsigned(number_unsigned_t) override { // NOLINT - close_value(); - return true; - } - bool number_float(number_float_t, const string_t &) override { // NOLINT - close_value(); - return true; - } - bool string(string_t &) override { // NOLINT - close_value(); - return true; - } - bool binary(binary_t &) override { // NOLINT - close_value(); - return true; - } - bool start_object(std::size_t) override { // NOLINT - stack.push_back({COMMON_JSON_STACK_ELEMENT_OBJECT, ""}); - return true; - } - bool end_object() override { - GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT); - stack.pop_back(); - close_value(); - return true; - } - bool key(string_t & key) override { // NOLINT - stack.push_back({COMMON_JSON_STACK_ELEMENT_KEY, key}); - return true; - } - bool start_array(std::size_t) override { // NOLINT - stack.push_back({COMMON_JSON_STACK_ELEMENT_ARRAY, ""}); - return true; - } - bool end_array() override { - GGML_ASSERT(!stack.empty() && stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY); - stack.pop_back(); - close_value(); - return true; - } - }; - json_error_locator err_loc; - auto start = it; - json::sax_parse(it, end, &err_loc); - - if (err_loc.found_error) { - it = start; - auto temptative_end = it + err_loc.position; - // LOG_DBG("Error at position %zu (is_end = %s): %s\n", err_loc.position, temptative_end == end ? "true" : "false", err_loc.exception_message.c_str()); - - auto input = std::string(it, temptative_end); - try { - out.json = json::parse(input); - // out.json = json::parse(it, temptative_end); - it = temptative_end; - return true; - } catch (const std::exception & ex) { - // No, needs healing. - LOG_DBG("Failed to parse up to error: %s: <<<%s>>>\n", ex.what(), std::string(it, temptative_end).c_str()); - } - auto can_parse = [](const std::string & str) { - try { - auto _ = json::parse(str); // NOLINT - return true; - } catch (const std::exception &) { - return false; - } - }; - if (!healing_marker.empty() && !err_loc.stack.empty()) { - std::string str(it, temptative_end); - auto last_non_sp_pos = str.find_last_not_of(" \n\r\t"); - if (last_non_sp_pos == std::string::npos) { - throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location"); - } - auto last_non_sp_char = str[last_non_sp_pos]; - // Used to detect stops on a number, which may not be complete. - auto was_maybe_number = [&]() { - if (!str.empty() && std::isspace(str.back())) { - return false; - } - return std::isdigit(last_non_sp_char) || - last_non_sp_char == '.' || - last_non_sp_char == 'e' || - last_non_sp_char == 'E' || - last_non_sp_char == '-'; - }; - - std::string closing; - for (size_t i = err_loc.stack.size(); i > 0; i--) { - auto & el = err_loc.stack[i - 1]; - if (el.type == COMMON_JSON_STACK_ELEMENT_OBJECT) { - closing += "}"; - } else if (el.type == COMMON_JSON_STACK_ELEMENT_ARRAY) { - closing += "]"; - } else if (el.type != COMMON_JSON_STACK_ELEMENT_KEY) { - throw std::runtime_error("Unexpected stack element type"); - } - } - - // Matches a potentially partial unicode escape sequence, e.g. \u, \uX, \uXX, \uXXX, \uXXXX - static const std::regex partial_unicode_regex(R"(\\u(?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F](?:[0-9a-fA-F])?)?)?)?$)"); - - auto is_high_surrogate = [&](const std::string & s) { - // Check if a partial of a high surrogate (U+D800-U+DBFF) - return s.length() >= 4 && - s[0] == '\\' && s[1] == 'u' && - std::tolower(s[2]) == 'd' && - (s[3] == '8' || s[3] == '9' || std::tolower(s[3]) == 'a' || std::tolower(s[3]) == 'b'); - }; - - // Initialize the unicode marker to a low surrogate to handle the edge case - // where a high surrogate (U+D800-U+DBFF) is immediately followed by a - // backslash (\) - std::string unicode_marker_padding = "udc00"; - std::smatch last_unicode_seq; - - if (std::regex_search(str, last_unicode_seq, partial_unicode_regex)) { - std::smatch second_last_seq; - std::string prelude = str.substr(0, last_unicode_seq.position()); - - // Pad the escape sequence with 0s until it forms a complete sequence of 6 characters - unicode_marker_padding = std::string(6 - last_unicode_seq.length(), '0'); - - if (is_high_surrogate(last_unicode_seq.str())) { - // If the sequence is a partial match for a high surrogate, add a low surrogate (U+DC00-U+UDFF) - unicode_marker_padding += "\\udc00"; - } else if (std::regex_search(prelude, second_last_seq, partial_unicode_regex)) { - if (is_high_surrogate(second_last_seq.str())) { - // If this follows a high surrogate, pad it to be a low surrogate - if (last_unicode_seq.length() == 2) { - unicode_marker_padding = "dc00"; - } else if (last_unicode_seq.length() == 3) { - unicode_marker_padding = "c00"; - } else { - // The original unicode_marker_padding is already padded with 0s - } - } - } - } - - const auto & magic_seed = out.healing_marker.marker = healing_marker;//"$llama.cpp.json$"; - - if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_KEY) { - // We're inside an object value - if (last_non_sp_char == ':' && can_parse(str + "1" + closing)) { - // Was about to create an object value - str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; - } else if (can_parse(str + ": 1" + closing)) { - str += (out.healing_marker.json_dump_marker = ":\"" + magic_seed) + "\"" + closing; - } else if (last_non_sp_char == '{' && can_parse(str + closing)) { - // Was about to create an object - str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing; - } else if (can_parse(str + "\"" + closing)) { - // Was inside an object value string - str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing; - } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) { - // Was inside an object value string after an escape - str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing; - } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) { - // Was inside an object value string after a partial unicode escape - str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing; - } else { - // find last : - auto last_pos = str.find_last_of(':'); - if (last_pos == std::string::npos) { - throw std::runtime_error("Cannot heal a truncated JSON that stopped in an unknown location"); - } - // Cutting back to opening : for object value - str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; - } - } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_ARRAY) { - if ((last_non_sp_char == ',' || last_non_sp_char == '[') && can_parse(str + "1" + closing)) { - // Was about to create an array value - str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; - } else if (can_parse(str + "\"" + closing)) { - // Was inside an array value string - str += (out.healing_marker.json_dump_marker = magic_seed) + "\"" + closing; - } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"" + closing)) { - // Was inside an array value string after an escape - str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\"" + closing; - } else if (can_parse(str + unicode_marker_padding + "\"" + closing)) { - // Was inside an array value string after a partial unicode escape - str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\"" + closing; - } else if (!was_maybe_number() && can_parse(str + ", 1" + closing)) { - // Had just finished a value - str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\"" + closing; - } else { - auto last_pos = str.find_last_of("[,"); - if (last_pos == std::string::npos) { - throw std::runtime_error("Cannot heal a truncated JSON array stopped in an unknown location"); - } - // Cutting back to last [ or , for array value - str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; - } - } else if (err_loc.stack.back().type == COMMON_JSON_STACK_ELEMENT_OBJECT) { - if ((last_non_sp_char == '{' && can_parse(str + closing)) || - (last_non_sp_char == ',' && can_parse(str + "\"\": 1" + closing))) { - // Was about to create an object key+value - str += (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\": 1" + closing; - } else if (!was_maybe_number() && can_parse(str + ",\"\": 1" + closing)) { - // Was about to create an object key+value - str += (out.healing_marker.json_dump_marker = ",\"" + magic_seed) + "\": 1" + closing; - } else if (can_parse(str + "\": 1" + closing)) { - // Was inside an object key string - str += (out.healing_marker.json_dump_marker = magic_seed) + "\": 1" + closing; - } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\": 1" + closing)) { - // Was inside an object key string after an escape - str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\": 1" + closing; - } else if (can_parse(str + unicode_marker_padding + "\": 1" + closing)) { - // Was inside an object key string after a partial unicode escape - str += (out.healing_marker.json_dump_marker = unicode_marker_padding + magic_seed) + "\": 1" + closing; - } else { - auto last_pos = str.find_last_of(':'); - if (last_pos == std::string::npos) { - throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location"); - } - // fprintf(stderr, "Cutting back to last : for object key+value\n"); - str = str.substr(0, last_pos + 1) + (out.healing_marker.json_dump_marker = "\"" + magic_seed) + "\"" + closing; - } - } else { - throw std::runtime_error("Cannot heal a truncated JSON object stopped in an unknown location"); - } - // fprintf(stderr, "HEALED:\nSTRING <<<\n%s\n>>>\n\nmagic_cut: <<<\n%s\n>>>\n\n", str.c_str(), out.healing_marker.json_dump_marker.c_str()); - out.json = json::parse(str); - it = temptative_end; - return true; - } - // handle unclosed top-level primitive - if (err_loc.position != 0 && !healing_marker.empty() && err_loc.stack.empty()) { - std::string str(it, temptative_end); - const auto & magic_seed = out.healing_marker.marker = healing_marker; - if (can_parse(str + "\"")) { - // Was inside an string - str += (out.healing_marker.json_dump_marker = magic_seed) + "\""; - } else if (str[str.length() - 1] == '\\' && can_parse(str + "\\\"")) { - // Was inside an string after an escape - str += (out.healing_marker.json_dump_marker = "\\" + magic_seed) + "\""; - } else { - // TODO: handle more unclosed top-level primitive if the stack was empty but we got an error (e.g. "tru", "\"", etc...) - // fprintf(stderr, "Closing: TODO\n"); - return false; - } - out.json = json::parse(str); - it = temptative_end; - return true; - } - return false; - } - out.json = json::parse(it, end); - it = end; - return true; -} diff --git a/common/json-partial.h b/common/json-partial.h deleted file mode 100644 index be51aabfbf41..000000000000 --- a/common/json-partial.h +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -// TODO: use json_fwd.hpp when possible -#include - -// Healing marker (empty if the JSON was fully parsed / wasn't healed). -struct common_healing_marker { - // Raw marker. - std::string marker; - - // Cutting the `common_json.json.dump()` string at the (only) occurrence of this marker should yield the original partial JSON string (modulo spaces / if it had the same dump format). - std::string json_dump_marker; -}; - -// Represents a parsed JSON object, with its optional healing marker (a JSON dump fragment that can be used to find the position of healing in the JSON dump string) -struct common_json { - nlohmann::ordered_json json; - - common_healing_marker healing_marker; -}; - -// Parse the JSON string, healing (closing) any partial JSON if `healing_marker` is not empty. -// -// Healing completes partial JSON strings by adding a (possibly modified) healing marker, then whatever is needed to close the JSON. -// This allows to parse the resulting healed JSON string, yet be able to cut it again if needed at the healing marker. -// (this is used when parsing JSON outputs from the models, then crafting partial JSONs for the partial tool calls in OAI format). -// -// For instance, parsing `{` with a healing marker `foo` will produce a healed JSON `{"foo":1}`, w/ json_dump_marker = `"foo"` (which can be used to break the JSON again). -bool common_json_parse( - const std::string & input, - const std::string & healing_marker, - common_json & out); - -// Parse the JSON string (see overload above), but advancing an iterator to the end of the input when the (potentially partial) parsing succeeds. -bool common_json_parse( - std::string::const_iterator & it, - const std::string::const_iterator & end, - const std::string & healing_marker, - common_json & out); diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index e2c4d6ce22e5..b18607cd6542 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -233,27 +233,27 @@ struct BuiltinRule { }; static std::unordered_map PRIMITIVE_RULES = { - {"boolean", {"(\"true\" | \"false\") space", {}}}, + {"boolean", {"(\"true\" | \"false\")", {}}}, {"decimal-part", {"[0-9]{1,16}", {}}}, {"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}}, - {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}}, - {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}}, + {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)?", {"integral-part", "decimal-part"}}}, + {"integer", {"(\"-\"? integral-part)", {"integral-part"}}}, {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}}, - {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}}, - {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}}, - {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}}, + {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? space \"}\"", {"string", "value"}}}, + {"array", {"\"[\" space ( value (\",\" space value)* )? space \"]\"", {"value"}}}, + {"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\"", {}}}, {"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}}, - {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}}, - {"null", {"\"null\" space", {}}}, + {"string", {"\"\\\"\" char* \"\\\"\"", {"char"}}}, + {"null", {"\"null\"", {}}}, }; static std::unordered_map STRING_FORMAT_RULES = { {"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}}, {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}}, {"date-time", {"date \"T\" time", {"date", "time"}}}, - {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}}, - {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}}, - {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}} + {"date-string", {"\"\\\"\" date \"\\\"\"", {"date"}}}, + {"time-string", {"\"\\\"\" time \"\\\"\"", {"time"}}}, + {"date-time-string", {"\"\\\"\" date-time \"\\\"\"", {"date-time"}}} }; static bool is_reserved_name(const std::string & name) { @@ -551,16 +551,16 @@ class common_schema_converter { } return join_seq(); }; - return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space"); + return _add_rule(name, "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\""); } /* Returns a rule that matches a JSON string that is none of the provided strings not_strings({"a"}) - -> ["] ( [a] char+ | [^"a] char* )? ["] space + -> ["] ( [a] char+ | [^"a] char* )? ["] not_strings({"and", "also"}) - -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space + -> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] */ std::string _not_strings(const std::vector & strings) { @@ -619,7 +619,7 @@ class common_schema_converter { if (!trie.is_end_of_string) { out << "?"; } - out << " [\"] space"; + out << " [\"]"; return out.str(); } @@ -725,7 +725,7 @@ class common_schema_converter { rule += " )?"; } - rule += " \"}\" space"; + rule += " space \"}\""; return rule; } @@ -858,14 +858,14 @@ class common_schema_converter { return _add_rule(rule_name, _generate_union_rule(name, schema_types)); } if (schema.contains("const")) { - return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space"); + return _add_rule(rule_name, _generate_constant_rule(schema["const"])); } if (schema.contains("enum")) { std::vector enum_values; for (const auto & v : schema["enum"]) { enum_values.push_back(_generate_constant_rule(v)); } - return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ") space"); + return _add_rule(rule_name, "(" + string_join(enum_values, " | ") + ")"); } if ((schema_type.is_null() || schema_type == "object") && (schema.contains("properties") || @@ -933,7 +933,7 @@ class common_schema_converter { } } if (!enum_intersection.empty()) { - return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ") space"); + return _add_rule(rule_name, "(" + string_join(enum_intersection, " | ") + ")"); } } return _add_rule(rule_name, _build_object_rule(properties, required, hybrid_name, json())); @@ -948,7 +948,7 @@ class common_schema_converter { } rule += visit(items[i], name + (name.empty() ? "" : "-") + "tuple-" + std::to_string(i)); } - rule += " \"]\" space"; + rule += " space \"]\""; return _add_rule(rule_name, rule); } std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item"); @@ -956,7 +956,7 @@ class common_schema_converter { json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json(); int max_items = max_items_json.is_number_integer() ? max_items_json.get() : std::numeric_limits::max(); - return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space"); + return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " space \"]\""); } if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) { return _visit_pattern(schema["pattern"], rule_name); @@ -972,7 +972,7 @@ class common_schema_converter { std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char")); int min_len = schema.contains("minLength") ? schema["minLength"].get() : 0; int max_len = schema.contains("maxLength") ? schema["maxLength"].get() : std::numeric_limits::max(); - return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space"); + return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\""); } if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) { int64_t min_value = std::numeric_limits::min(); @@ -990,7 +990,7 @@ class common_schema_converter { std::stringstream out; out << "("; build_min_max_int(min_value, max_value, out); - out << ") space"; + out << ")"; return _add_rule(rule_name, out.str()); } if (schema.empty() || schema_type == "object") { diff --git a/common/log.cpp b/common/log.cpp index bd62616d8a19..2d1e74ad1fe3 100644 --- a/common/log.cpp +++ b/common/log.cpp @@ -11,8 +11,13 @@ #include #include #include +#include #if defined(_WIN32) +# define WIN32_LEAN_AND_MEAN +# ifndef NOMINMAX +# define NOMINMAX +# endif # include # include # define isatty _isatty @@ -62,16 +67,15 @@ static const char* g_col[] = { }; struct common_log_entry { - enum ggml_log_level level; - - bool prefix; - - int64_t timestamp; + enum ggml_log_level level {GGML_LOG_LEVEL_INFO}; std::vector msg; - // signals the worker thread to stop - bool is_end; + int64_t timestamp { 0 }; + bool is_end { false }; // signals the worker thread to stop + bool prefix { false }; + + common_log_entry(size_t size = 256) : msg(size) { } void print(FILE * file = nullptr) const { FILE * fcur = file; @@ -122,22 +126,15 @@ struct common_log_entry { }; struct common_log { - // default capacity - will be expanded if needed - common_log() : common_log(256) {} - - common_log(size_t capacity) { - file = nullptr; - prefix = false; + // default capacity + common_log(size_t capacity = 512) { + file = nullptr; + prefix = false; timestamps = false; - running = false; - t_start = t_us(); - - // initial message size - will be expanded if longer messages arrive - entries.resize(capacity); - for (auto & entry : entries) { - entry.msg.resize(256); - } + running = false; + t_start = t_us(); + queue.resize(capacity, common_log_entry(256)); head = 0; tail = 0; @@ -152,9 +149,10 @@ struct common_log { } private: - std::mutex mtx; - std::thread thrd; - std::condition_variable cv; + std::mutex mtx; + std::thread thrd; + std::condition_variable cv_new; // new entry + std::condition_variable cv_full; // wait on full FILE * file; @@ -164,24 +162,53 @@ struct common_log { int64_t t_start; - // ring buffer of entries - std::vector entries; + // queue of entries + std::vector queue; size_t head; size_t tail; - // worker thread copies into this - common_log_entry cur; + bool print_entry(const common_log_entry & e) const { + if (e.is_end) return true; + + e.print(); + if (file) { + e.print(file); + } + return false; + } + + bool flush_queue(size_t start_head, size_t end_tail, size_t & out_head) const { + bool stop = false; + size_t h = start_head; + while (h != end_tail && !stop) { + stop = print_entry(queue[h]); + h = (h + 1) % queue.size(); + } + out_head = h; + return stop; + } public: + bool is_full() const { + return ((tail + 1) % queue.size()) == head; + } + + bool is_empty() const { + return head == tail; + } + void add(enum ggml_log_level level, const char * fmt, va_list args) { - std::lock_guard lock(mtx); + std::unique_lock lock(mtx); + + // block if the queue is full + cv_full.wait(lock, [this]() { return !running || !is_full(); }); if (!running) { // discard messages while the worker thread is paused return; } - auto & entry = entries[tail]; + auto & entry = queue[tail]; { // cannot use args twice, so make a copy in case we need to expand the buffer @@ -216,38 +243,16 @@ struct common_log { va_end(args_copy); } - entry.level = level; - entry.prefix = prefix; + entry.is_end = false; + entry.level = level; + entry.prefix = prefix; entry.timestamp = 0; if (timestamps) { entry.timestamp = t_us() - t_start; } - entry.is_end = false; - - tail = (tail + 1) % entries.size(); - if (tail == head) { - // expand the buffer - std::vector new_entries(2*entries.size()); - - size_t new_tail = 0; - do { - new_entries[new_tail] = std::move(entries[head]); - - head = (head + 1) % entries.size(); - new_tail = (new_tail + 1); - } while (head != tail); - - head = 0; - tail = new_tail; - - for (size_t i = tail; i < new_entries.size(); i++) { - new_entries[i].msg.resize(256); - } - - entries = std::move(new_entries); - } - cv.notify_one(); + tail = (tail + 1) % queue.size(); + cv_new.notify_one(); } void resume() { @@ -261,22 +266,23 @@ struct common_log { thrd = std::thread([this]() { while (true) { - { - std::unique_lock lock(mtx); - cv.wait(lock, [this]() { return head != tail; }); - cur = entries[head]; + std::unique_lock lock(mtx); + cv_new.wait(lock, [this]() { return !is_empty(); }); - head = (head + 1) % entries.size(); - } + size_t cached_head = head; + size_t cached_tail = tail; - if (cur.is_end) { - break; - } + lock.unlock(); // drop the lock during flush + + size_t next_head; + bool stop = flush_queue(cached_head, cached_tail, next_head); - cur.print(); // stdout and stderr + lock.lock(); + head = next_head; + cv_full.notify_all(); - if (file) { - cur.print(file); + if (stop) { + break; } } }); @@ -293,13 +299,13 @@ struct common_log { running = false; // push an entry to signal the worker thread to stop - { - auto & entry = entries[tail]; - entry.is_end = true; + auto & entry = queue[tail]; + entry.is_end = true; + tail = (tail + 1) % queue.size(); - tail = (tail + 1) % entries.size(); - } - cv.notify_one(); + // wakeup everyone + cv_new.notify_one(); + cv_full.notify_all(); } thrd.join(); diff --git a/common/peg-parser.cpp b/common/peg-parser.cpp index e37c1ce80e1d..807e952d902c 100644 --- a/common/peg-parser.cpp +++ b/common/peg-parser.cpp @@ -6,13 +6,14 @@ #include "unicode.h" #include +#include #include #include #include #include #include +#include #include -#include // Trick to catch missing branches template @@ -88,40 +89,7 @@ struct trie { return match_result{match_result::NO_MATCH}; } - struct prefix_and_next { - std::vector prefix; - std::vector next_chars; - }; - - std::vector collect_prefix_and_next() { - std::vector prefix; - std::vector result; - collect_prefix_and_next(0, prefix, result); - return result; - } - private: - void collect_prefix_and_next(size_t index, std::vector & prefix, std::vector & out) { - if (!nodes[index].is_word) { - if (!nodes[index].children.empty()) { - std::vector chars; - chars.reserve(nodes[index].children.size()); - for (const auto & p : nodes[index].children) { - chars.push_back(p.first); - } - out.emplace_back(prefix_and_next{prefix, chars}); - } - } - - for (const auto & p : nodes[index].children) { - uint32_t ch = p.first; - auto child = p.second; - prefix.push_back(ch); - collect_prefix_and_next(child, prefix, out); - prefix.pop_back(); - } - } - size_t create_node() { size_t index = nodes.size(); nodes.emplace_back(); @@ -153,6 +121,65 @@ struct trie { } }; +// Aho-Corasick automaton +struct aho_corasick { + trie t; + std::vector fail; // failure links + std::vector order; // states in BFS order + std::vector terminal; // match states (directly or via a suffix link) + std::set alphabet; // every character with a transition + + aho_corasick(const std::vector & strings) : t(strings) { + const auto & nodes = t.nodes; + const size_t n = nodes.size(); + + fail.assign(n, 0); + order.reserve(n); + + std::deque queue{ 0 }; + while (!queue.empty()) { + size_t u = queue.front(); + queue.pop_front(); + order.push_back(u); + for (const auto & [ch, v] : nodes[u].children) { + if (u != 0) { + size_t f = fail[u]; + while (f && nodes[f].children.find(ch) == nodes[f].children.end()) { + f = fail[f]; + } + auto it = nodes[f].children.find(ch); + fail[v] = (it != nodes[f].children.end() && it->second != v) ? it->second : 0; + } + queue.push_back(v); + } + } + + terminal.assign(n, false); + for (size_t u : order) { + terminal[u] = nodes[u].is_word || (u != 0 && terminal[fail[u]]); + } + + for (const auto & node : nodes) { + for (const auto & [ch, v] : node.children) { + alphabet.insert(ch); + } + } + } + + size_t num_states() const { return t.nodes.size(); } + bool is_terminal(size_t s) const { return terminal[s]; } + + // follow failure links until a transition on `ch` exists. + size_t next(size_t state, uint32_t ch) const { + const auto & nodes = t.nodes; + while (state && nodes[state].children.find(ch) == nodes[state].children.end()) { + state = fail[state]; + } + auto it = nodes[state].children.find(ch); + return it != nodes[state].children.end() ? it->second : 0; + } +}; + static std::pair parse_hex_escape(const std::string & str, size_t pos, int hex_count) { if (pos + hex_count > str.length()) { return {0, 0}; @@ -894,6 +921,10 @@ struct parser_executor { common_peg_parse_result operator()(const common_peg_gbnf_parser & p) { return arena.parse(p.child, ctx, start_pos); } + + common_peg_parse_result operator()(const common_peg_ac_parser & p) { + return arena.parse(p.child, ctx, start_pos); + } }; common_peg_parse_result common_peg_arena::parse(common_peg_parse_context & ctx, size_t start) const { @@ -962,7 +993,8 @@ void common_peg_arena::resolve_refs() { std::is_same_v || std::is_same_v || std::is_same_v || - std::is_same_v) { + std::is_same_v || + std::is_same_v) { p.child = resolve_ref(p.child); } else if constexpr (std::is_same_v) { p.child = resolve_ref(p.child); @@ -992,12 +1024,12 @@ void common_peg_arena::resolve_refs() { } std::string common_peg_arena::dump(common_peg_parser_id id) const { - std::unordered_set visited; + std::set visited; return dump_impl(id, visited); } std::string common_peg_arena::dump_impl(common_peg_parser_id id, - std::unordered_set & visited) const { + std::set & visited) const { // Check for cycles if (visited.count(id)) { return "[cycle]"; @@ -1043,6 +1075,8 @@ std::string common_peg_arena::dump_impl(common_peg_parser_id return "Atomic(" + dump_impl(p.child, visited) + ")"; } else if constexpr (std::is_same_v) { return "Gbnf(" + p.grammar + ", " + dump_impl(p.child, visited) + ")"; + } else if constexpr (std::is_same_v) { + return "Ac(" + string_join(p.delimiters, " | ") + ", " + dump_impl(p.child, visited) + ")"; } else if constexpr (std::is_same_v) { return "Any"; } else if constexpr (std::is_same_v) { @@ -1272,13 +1306,13 @@ common_peg_parser common_peg_parser_builder::string_content(char delimiter) { common_peg_parser common_peg_parser_builder::double_quoted_string() { return rule("double-quoted-string", [this]() { - return sequence({literal("\""), string_content('"'), literal("\""), space()}); + return sequence({literal("\""), string_content('"'), literal("\"")}); }); } common_peg_parser common_peg_parser_builder::single_quoted_string() { return rule("single-quoted-string", [this]() { - return sequence({literal("'"), string_content('\''), literal("'"), space()}); + return sequence({literal("'"), string_content('\''), literal("'")}); }); } @@ -1301,25 +1335,25 @@ common_peg_parser common_peg_parser_builder::json_number() { // At EOF in partial mode, chars returns NEED_MORE → negate propagates NEED_MORE → number not committed. // This prevents premature commits of partial numbers (e.g. "3" when "3.14" is incoming). auto not_number_continuation = negate(chars("[0-9.eE+-]", 1, 1)); - return sequence({ optional(literal("-")), int_part, optional(frac), optional(exp), not_number_continuation, space() }); + return sequence({ optional(literal("-")), int_part, optional(frac), optional(exp), not_number_continuation }); }); } common_peg_parser common_peg_parser_builder::json_string() { return rule("json-string", [this]() { - return sequence({literal("\""), string_content('"'), literal("\""), space()}); + return sequence({literal("\""), string_content('"'), literal("\"")}); }); } common_peg_parser common_peg_parser_builder::json_bool() { return rule("json-bool", [this]() { - return sequence({choice({literal("true"), literal("false")}), space()}); + return choice({literal("true"), literal("false")}); }); } common_peg_parser common_peg_parser_builder::json_null() { return rule("json-null", [this]() { - return sequence({literal("null"), space()}); + return literal("null"); }); } @@ -1334,8 +1368,7 @@ common_peg_parser common_peg_parser_builder::json_object() { choice({ literal("}"), sequence({members, ws, literal("}")}) - }), - ws + }) }); }); } @@ -1343,15 +1376,14 @@ common_peg_parser common_peg_parser_builder::json_object() { common_peg_parser common_peg_parser_builder::json_array() { return rule("json-array", [this]() { auto ws = space(); - auto elements = sequence({json(), zero_or_more(sequence({literal(","), ws, json()}))}); + auto elements = sequence({json(), zero_or_more(sequence({ws, literal(","), ws, json()}))}); return sequence({ literal("["), ws, choice({ literal("]"), sequence({elements, ws, literal("]")}) - }), - ws + }) }); }); } @@ -1381,16 +1413,13 @@ common_peg_parser common_peg_parser_builder::python_number() { common_peg_parser common_peg_parser_builder::python_bool() { return rule("python-bool", [this]() { - return sequence({ - choice({literal("True"), literal("False")}), - space() - }); + return choice({literal("True"), literal("False")}); }); } common_peg_parser common_peg_parser_builder::python_null() { return rule("python-none", [this]() { - return sequence({literal("None"), space()}); + return literal("None"); }); } @@ -1457,6 +1486,13 @@ common_peg_parser common_peg_parser_builder::json_member(const std::string & key }); } +common_peg_parser common_peg_parser_builder::ac(const common_peg_parser & p, const std::vector & delimiters) { + if (delimiters.empty()) { + throw std::runtime_error("ac parser requires at least one delimiter"); + } + return add(common_peg_ac_parser{p, delimiters}); +} + static std::string gbnf_escape_char_class(uint32_t c) { if (c == '-' || c == ']' || c == '[' || c == '\\') { return "\\" + std::string(1, (char) c); @@ -1507,41 +1543,118 @@ static std::string gbnf_escape_char_class(uint32_t c) { return std::string(buf); } -static std::string gbnf_excluding_pattern(const std::vector & strings) { - trie matcher(strings); - auto pieces = matcher.collect_prefix_and_next(); - - std::string pattern; - for (size_t i = 0; i < pieces.size(); ++i) { - if (i > 0) { - pattern += " | "; - } +static std::string gbnf_char_class(const std::vector & chars, bool negate) { + std::string s = negate ? "[^" : "["; + for (uint32_t ch : chars) { + s += gbnf_escape_char_class(ch); + } + return s + "]"; +} - const auto & pre = pieces[i].prefix; - const auto & chars = pieces[i].next_chars; +static std::string gbnf_ac_grammar( + const common_grammar_builder & builder, + const std::string & prefix, + const std::vector & strings, + const std::function &, + const std::map> &, + const std::vector &, + const std::function &)> & build_rule) { + aho_corasick ac(strings); - std::string cls; - cls.reserve(chars.size()); - for (uint32_t ch : chars) { - cls += gbnf_escape_char_class(ch); + auto state_name = [&](size_t s) -> std::string { + if (s == 0) { + return prefix; } + std::string num = std::to_string(s); + num = num.size() == 1 ? ("0" + num) : num; + return prefix + "-" + num; + }; - if (!pre.empty()) { - pattern += gbnf_format_literal(common_unicode_cpts_to_utf8(pre)) + " [^" + cls + "]"; - } else { - pattern += "[^" + cls + "]"; + for (size_t q = 0; q < ac.num_states(); q++) { + if (ac.is_terminal(q)) { + continue; // match states + } + + std::map> buckets; + std::vector completing; // chars that complete a delimiter + std::vector specific; // chars with an explicit transition + for (uint32_t c : ac.alphabet) { + size_t d = ac.next(q, c); + if (ac.is_terminal(d)) { + completing.push_back(c); + specific.push_back(c); + } else if (d != 0) { + buckets[d].push_back(c); // specific non-root destination + specific.push_back(c); + } } + + builder.add_rule(state_name(q), build_rule(completing, buckets, specific, state_name)); } - return "(" + pattern + ")*"; + // An empty delimiter makes the start state terminal. Emit an entry rule + // that matches the empty string so the returned reference stays valid. + if (ac.is_terminal(0)) { + builder.add_rule(prefix, "|"); + } + + return state_name(0); +} + +// GBNF grammar matching strings that contain no string in `strings` as a +// substring. Emits the complement of an Aho-Corasick automaton DFA and returns +// the start state rule name. +// +// ref: https://github.com/ggml-org/llama.cpp/pull/24839 +static std::string gbnf_excluding_grammar(const common_grammar_builder & builder, + const std::string & prefix, + const std::vector & strings) { + return gbnf_ac_grammar(builder, prefix, strings, + [](const std::vector & /*completing*/, + const std::map> & buckets, + const std::vector & specific, + const std::function & state_name) { + // every state is accepting and completing chars get no + // alternative, so a forbidden string can never be matched + std::string rhs = "|"; + for (const auto & [d, chars] : buckets) { + rhs += " " + gbnf_char_class(chars, false) + " " + state_name(d) + " |"; + } + rhs += " " + gbnf_char_class(specific, true) + " " + state_name(0); + return rhs; + }); +} + +// GBNF grammar matching everything up to and including the first occurrence of +// any string in `strings`. Emits the Aho-Corasick automaton DFA and returns +// the start state rule name. +static std::string gbnf_including_grammar(const common_grammar_builder & builder, + const std::string & prefix, + const std::vector & strings) { + return gbnf_ac_grammar(builder, prefix, strings, + [](const std::vector & completing, + const std::map> & buckets, + const std::vector & specific, + const std::function & state_name) { + std::vector alts; + if (!completing.empty()) { + alts.push_back(gbnf_char_class(completing, false)); // terminate on match + } + for (const auto & [d, chars] : buckets) { + alts.push_back(gbnf_char_class(chars, false) + " " + state_name(d)); + } + // every other character keeps scanning from the start state + alts.push_back(gbnf_char_class(specific, true) + " " + state_name(0)); + return string_join(alts, " | "); + }); } -static std::unordered_set collect_reachable_rules( +static std::set collect_reachable_rules( const common_peg_arena & arena, const common_peg_parser_id & rule ) { - std::unordered_set reachable; - std::unordered_set visited; + std::set reachable; + std::set visited; std::function visit = [&](common_peg_parser_id id) { const auto & parser = arena.get(id); @@ -1573,6 +1686,7 @@ static std::unordered_set collect_reachable_rules( std::is_same_v || std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v) { visit(p.child); } else if constexpr (std::is_same_v) { @@ -1750,7 +1864,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo if (p.delimiters.empty()) { return ".*"; } - return gbnf_excluding_pattern(p.delimiters); + return gbnf_excluding_grammar(builder, "until-" + std::to_string(id), p.delimiters); } else if constexpr (std::is_same_v) { if (schema_delegates(p)) { return to_gbnf(p.child); @@ -1767,6 +1881,8 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo return to_gbnf(p.child); } else if constexpr (std::is_same_v) { return p.grammar; + } else if constexpr (std::is_same_v) { + return gbnf_including_grammar(builder, "ac-" + std::to_string(id), p.delimiters); } else { static_assert(is_always_false_v); } @@ -1774,7 +1890,7 @@ void common_peg_arena::build_grammar(const common_grammar_builder & builder, boo }; // Collect reachable rules - std::unordered_set reachable_rules; + std::set reachable_rules; if (lazy) { // Collect rules reachable from trigger rules @@ -1903,6 +2019,8 @@ static nlohmann::json serialize_parser_variant(const common_peg_parser_variant & }; } else if constexpr (std::is_same_v) { return json{{"type", "gbnf"}, {"child", p.child}, {"grammar", p.grammar}}; + } else if constexpr (std::is_same_v) { + return json{{"type", "ac"}, {"child", p.child}, {"delimiters", p.delimiters}}; } }, variant); } @@ -2075,6 +2193,16 @@ static common_peg_parser_variant deserialize_parser_variant(const nlohmann::json }; } + if (type == "ac") { + if (!j.contains("child") || !j.contains("delimiters") || !j["delimiters"].is_array() || j["delimiters"].empty()) { + throw std::runtime_error("ac parser requires 'child' and a non-empty 'delimiters' array"); + } + return common_peg_ac_parser{ + j["child"].get(), + j["delimiters"].get>(), + }; + } + throw std::runtime_error("Unknown parser type: " + type); } diff --git a/common/peg-parser.h b/common/peg-parser.h index b6bb05214bb2..c198499dd934 100644 --- a/common/peg-parser.h +++ b/common/peg-parser.h @@ -3,8 +3,8 @@ #include #include +#include #include -#include #include #include #include @@ -275,6 +275,11 @@ struct common_peg_gbnf_parser { std::string grammar; }; +struct common_peg_ac_parser { + common_peg_parser_id child; + std::vector delimiters; +}; + // Variant holding all parser types using common_peg_parser_variant = std::variant< common_peg_epsilon_parser, @@ -296,7 +301,8 @@ using common_peg_parser_variant = std::variant< common_peg_ref_parser, common_peg_atomic_parser, common_peg_tag_parser, - common_peg_gbnf_parser + common_peg_gbnf_parser, + common_peg_ac_parser >; class common_peg_arena { @@ -335,7 +341,7 @@ class common_peg_arena { friend class common_peg_parser_builder; private: - std::string dump_impl(common_peg_parser_id id, std::unordered_set & visited) const; + std::string dump_impl(common_peg_parser_id id, std::set & visited) const; common_peg_parser_id add_parser(common_peg_parser_variant parser); void add_rule(const std::string & name, common_peg_parser_id id); @@ -514,6 +520,13 @@ class common_peg_parser_builder { // the child's grammar. Parsing delegates entirely to the child. common_peg_parser gbnf(const common_peg_parser & p, const std::string & grammar) { return add(common_peg_gbnf_parser{p, grammar}); } + // Wraps a child parser but emits a GBNF grammar built from the Aho-Corasick + // automaton of `delimiters`, matching everything up to and including the + // first delimiter. Parsing delegates entirely to the child, which is + // responsible for consuming the delimiter (e.g. until(D) + literal(D)). + common_peg_parser ac(const common_peg_parser & p, const std::vector & delimiters); + common_peg_parser ac(const common_peg_parser & p, const std::string & delimiter) { return ac(p, std::vector{delimiter}); } + void set_root(const common_peg_parser & p); common_peg_arena build(); diff --git a/common/preset.cpp b/common/preset.cpp index 51ea984d8cdf..f0cc1fa1a242 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -16,48 +16,6 @@ static std::string rm_leading_dashes(const std::string & str) { return str.substr(pos); } -// only allow a subset of args for remote presets for security reasons -// do not add more args unless absolutely necessary -// args that output to files are strictly prohibited -static std::set get_remote_preset_whitelist(const std::map & key_to_opt) { - static const std::set allowed_options = { - "model-url", - "hf-repo", - "hf-repo-draft", - "hf-repo-v", // vocoder - "hf-file-v", // vocoder - "mmproj-url", - "pooling", - "jinja", - "batch-size", - "ubatch-size", - "cache-reuse", - "chat-template-kwargs", - "mmap", - // note: sampling params are automatically allowed by default - // negated args will be added automatically if the positive arg is specified above - }; - - std::set allowed_keys; - - for (const auto & it : key_to_opt) { - const std::string & key = it.first; - const common_arg & opt = it.second; - if (allowed_options.find(key) != allowed_options.end() || opt.is_sampling) { - allowed_keys.insert(key); - // also add variant keys (args without leading dashes and env vars) - for (const auto & arg : opt.get_args()) { - allowed_keys.insert(rm_leading_dashes(arg)); - } - for (const auto & env : opt.get_env()) { - allowed_keys.insert(env); - } - } - } - - return allowed_keys; -} - std::vector common_preset::to_args(const std::string & bin_path) const { std::vector args; @@ -300,16 +258,10 @@ static std::string parse_bool_arg(const common_arg & arg, const std::string & ke return value; } -common_preset_context::common_preset_context(llama_example ex, bool only_remote_allowed) +common_preset_context::common_preset_context(llama_example ex) : ctx_params(common_params_parser_init(default_params, ex)) { common_params_add_preset_options(ctx_params.options); key_to_opt = get_map_key_opt(ctx_params); - - // setup allowed keys if only_remote_allowed is true - if (only_remote_allowed) { - filter_allowed_keys = true; - allowed_keys = get_remote_preset_whitelist(key_to_opt); - } } common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const { diff --git a/common/preset.h b/common/preset.h index 06f829c3e5e9..52935ebde86e 100644 --- a/common/preset.h +++ b/common/preset.h @@ -60,7 +60,7 @@ struct common_preset_context { std::set allowed_keys; // if only_remote_allowed is true, only accept whitelisted keys - common_preset_context(llama_example ex, bool only_remote_allowed = false); + common_preset_context(llama_example ex); // load presets from INI file common_presets load_from_ini(const std::string & path, common_preset & global) const; diff --git a/common/sampling.cpp b/common/sampling.cpp index c537f335039c..75a299e23ece 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -259,6 +259,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st } } } + if (!grmr && !grammar_str.empty()) { + throw std::runtime_error("failed to parse grammar"); + } // Compute prefill tokens from the generation prompt std::vector prefill_tokens; diff --git a/common/speculative.cpp b/common/speculative.cpp index 653f932c90b5..c922a3f592a6 100644 --- a/common/speculative.cpp +++ b/common/speculative.cpp @@ -140,6 +140,8 @@ struct common_speculative_impl { size_t n_gen_tokens = 0; // number of tokens generated by this implementation. size_t n_acc_tokens = 0; // number of tokens accepted by the target model. + std::vector n_acc_tokens_per_pos; // number of tokens accepted per draft position. + // TODO: track performance of most recent calls const bool gen_perf = true; // whether to generate performance stats. @@ -159,6 +161,10 @@ struct common_speculative_impl { virtual void accept(llama_seq_id seq_id, uint16_t n_accepted, bool is_other) = 0; + // (optional) serialize/restore per-seq internal state (e.g. eagle3's deferred boundary). + virtual bool get_state(llama_seq_id /*seq_id*/, std::vector & /*data*/) const { return false; } + virtual void set_state(llama_seq_id /*seq_id*/, const std::vector & /*data*/) {} + // true if this implementation requires the target context to extract post-norm embeddings virtual bool need_embd() const = 0; @@ -375,31 +381,511 @@ struct common_speculative_impl_draft_simple : public common_speculative_impl { } }; + +// EAGLE3 speculative decoding state +// +// Input of draft decoder: (This is different compared to MTP) +// At "pos P", the decoder takes input pair (t_{P+1}, g_P), with RoPE at P. +// - t_{P+1} = token at sequence pos P+1 (the *next* token after P) +// - g_P = encoder output = projection of target's extracted hidden states at P +// +// Deferred boundary (MTP doesn't have this issue): +// Within a single process() call with n_tokens, we can only write decoder KV for +// training pos 0..n_tokens-2. The last training pos (n_tokens-1) needs t_{n_tokens} +// which lies *outside* this batch — it is the token target will sample next or the first token from next ubatch. +// So the last training pos of each process() call is *deferred* to whichever next call has +// the missing token in hand: +// - multi-ubatch prefill: the next process()'s first token completes the pair +// (handled by the per-seq "cross-ubatch bridge") +// - single-ubatch prefill / after verify: draft()'s seed step uses "dp.id_last" +// (target's freshest sample) to complete the pair +// +// Per-seq carry-over state: +// pending_g_last [n_embd_dec] ┐ the deferred boundary's (g, pos). Set by +// pending_pos_last llama_pos ┘ process() at end of ubatch (= last row); +// rebased by accept() to first-non-accepted pos. +// verify_g [N × n_embd_dec] snapshot of process()'s encoder output; +// verify_pos_first llama_pos consumed by accept() to recover the right +// verify_g_rows int32_t pending_g_last row for any n_accepted value. +// +// Performance is overall good but there is waste in verify cycle: +// process() runs encoder + decoder on the *full* verify batch including rows for +// rejected drafts. The KV at those positions is then dropped. +// +// TODO: Not sure if we need optimization for this waste? +// If so we may need hybrid stash: +// in verify mode, have process() only stash features and let draft() seed run +// encoder+decoder on n_accepted+1 rows). struct common_speculative_impl_draft_eagle3 : public common_speculative_impl { - //common_params_speculative_eagle3 params; + common_params_speculative_draft params; + llama_batch batch; + + std::vector smpls; + + // backend sampler chain per seq, attached to ctx_dft + std::vector backend_chains; + + int32_t n_embd_dec = 0; // draft hidden size + int32_t n_embd_enc = 0; // target_layer_ids_n * target_hidden_size + int32_t n_embd_tgt = 0; // target model hidden size + + const int32_t * target_layer_ids = nullptr; // model_dft's extract layer indices + uint32_t target_layer_ids_n = 0; + + // [per-seq] deferred boundary state + std::vector> pending_g_last; + std::vector pending_pos_last; + + // [per-seq] snapshot of the most recent process()'s encoder output + std::vector> verify_g; // [n_seq][n_rows * n_embd_dec] + std::vector verify_pos_first; // [n_seq] — pos of verify_g[seq][0] + std::vector verify_g_rows; // [n_seq] — number of rows + + // scratch buffer for concatenated target features [n_tokens, n_embd_enc] + std::vector features_buf; + std::vector g_embd_buf; common_speculative_impl_draft_eagle3(const common_params_speculative & params, uint32_t n_seq) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, n_seq) + , params(params.draft) { LOG_INF("%s: adding speculative implementation 'draft-eagle3'\n", __func__); - LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min); + LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%f, backend_sampling=%d\n", __func__, params.draft.n_max, params.draft.n_min, params.draft.p_min, (int) params.draft.backend_sampling); + + auto * ctx_tgt = this->params.ctx_tgt; + auto * ctx_dft = this->params.ctx_dft; + GGML_ASSERT(ctx_tgt && ctx_dft && "EAGLE3 requires ctx_tgt and ctx_dft to be set"); + + const llama_model * model_dft = llama_get_model(ctx_dft); + const llama_model * model_tgt = llama_get_model(ctx_tgt); + + target_layer_ids = llama_model_target_layer_ids (model_dft); + target_layer_ids_n = llama_model_target_layer_ids_n(model_dft); + if (target_layer_ids_n != 3) { + throw std::runtime_error("draft model is not eagle3 (expected 3 extract layers, got " + + std::to_string(target_layer_ids_n) + ")"); + } + + n_embd_tgt = llama_model_n_embd(model_tgt); + n_embd_dec = llama_model_n_embd(model_dft); + n_embd_enc = (int32_t) target_layer_ids_n * n_embd_tgt; + + const int32_t n_b = (int32_t) llama_n_batch(ctx_dft); + batch = llama_batch_init(/*n_tokens=*/ n_b, /*embd=*/ n_embd_dec, /*n_seq_max=*/ 1); + // llama_batch_init allocates only one of token/embd; eagle3 decoder needs both. + // TODO: fix, how to call without malloc + batch.token = (llama_token *) malloc(sizeof(llama_token) * n_b); + + smpls.resize(n_seq); + for (auto & s : smpls) { + common_params_sampling sparams; + sparams.no_perf = false; + sparams.top_k = 10; + sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K }; + s.reset(common_sampler_init(llama_get_model(ctx_dft), sparams)); + } + + // offload draft sampling to the backend + backend_chains.assign(n_seq, nullptr); + if (this->params.backend_sampling) { + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + llama_sampler * chain = llama_sampler_chain_init(llama_sampler_chain_default_params()); + llama_sampler_chain_add(chain, llama_sampler_init_top_k(10)); + + if (!llama_set_sampler(ctx_dft, seq_id, chain)) { + LOG_WRN("%s: backend offload failed for seq_id=%d; using CPU sampler\n", __func__, (int) seq_id); + llama_sampler_free(chain); + chain = nullptr; + } + backend_chains[seq_id] = chain; + } + } + + // turn on extraction of the target layers' input embeddings + for (uint32_t k = 0; k < target_layer_ids_n; ++k) { + llama_set_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k], true); + } + + // turn on extraction of the draft model's pre-norm hidden state + // (used both for the encoder output g_embd and the decoder pre-norm output). + llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true); + + pending_g_last.assign(n_seq, std::vector(n_embd_dec, 0.0f)); + pending_pos_last.assign(n_seq, -1); + + verify_g.assign(n_seq, std::vector()); + verify_pos_first.assign(n_seq, -1); + verify_g_rows.assign(n_seq, 0); } - void begin(llama_seq_id /*seq_id*/, const llama_tokens & /*prompt*/) override { - // noop + ~common_speculative_impl_draft_eagle3() override { + auto * ctx_dft = this->params.ctx_dft; + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) backend_chains.size(); ++seq_id) { + if (backend_chains[seq_id] == nullptr) { + continue; + } + if (ctx_dft) { + llama_set_sampler(ctx_dft, seq_id, nullptr); + } + llama_sampler_free(backend_chains[seq_id]); + } + backend_chains.clear(); + + if (batch.token != nullptr) { + free(batch.token); + batch.token = nullptr; + } + llama_batch_free(batch); } - bool process(const llama_batch & /*batch*/) override { - // TODO: implement + void begin(llama_seq_id seq_id, const llama_tokens & prompt) override { + const int32_t N = (int32_t) prompt.size(); + if (N <= 0) { + return; + } + // expected state after prefill: ctx_dft has pos 0..N-2 (last position is deferred to + // draft()'s seed step). Warn only if more than one position is missing. + auto * ctx_dft = this->params.ctx_dft; + const llama_pos pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id); + if (pos_max < N - 2) { + LOG_WRN("%s: ctx_dft pos_max=%d < N-2=%d — process() did not run on every prefill ubatch. " + "Drafts may degrade.\n", + __func__, (int) pos_max, N - 2); + } + } + + bool process(const llama_batch & batch_in) override { + if (batch_in.n_tokens <= 0) { + return true; + } + + if (batch_in.token == nullptr || batch_in.embd != nullptr) { + return true; + } + + const int32_t n_tokens = batch_in.n_tokens; + + // i_batch_beg[seq] / i_batch_end[seq]: inclusive batch indices of this seq's + // first/last token in batch_in. Assumes per-seq tokens are contiguous within + // the ubatch (server's default ordering). + std::vector i_batch_beg(n_seq, -1); + std::vector i_batch_end(n_seq, -1); + for (int k = 0; k < n_tokens; ++k) { + GGML_ASSERT(batch_in.n_seq_id[k] == 1); + const llama_seq_id seq_id = batch_in.seq_id[k][0]; + if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { + continue; + } + i_batch_end[seq_id] = k; + if (i_batch_beg[seq_id] < 0) { + i_batch_beg[seq_id] = k; + } + } + + auto * ctx_tgt = this->params.ctx_tgt; + auto * ctx_dft = this->params.ctx_dft; + + // Interleave each extract_layer's hidden state into a contiguous buffer of + // shape [n_tokens, target_layer_ids_n * n_embd_tgt]. Then run EAGLE3 encoder + // to get one g_embd row per token. + features_buf.resize((size_t) n_tokens * n_embd_enc, 0.0f); + + for (uint32_t k = 0; k < target_layer_ids_n; ++k) { + const float * layer = llama_get_embeddings_layer_inp(ctx_tgt, (uint32_t) target_layer_ids[k]); + if (!layer) { + GGML_ABORT("EAGLE3: target layer %d input not extracted.", target_layer_ids[k]); + } + for (int32_t i = 0; i < n_tokens; ++i) { + float * dst = features_buf.data() + (size_t) i * n_embd_enc + k * (size_t) n_embd_tgt; + const float * src = layer + (size_t) i * n_embd_tgt; + std::memcpy(dst, src, (size_t) n_embd_tgt * sizeof(float)); + } + } + + g_embd_buf.resize((size_t) n_tokens * n_embd_dec); + + // llama_encode() requires the full encoder batch to fit in n_ubatch. + // Allow batch > ubatch: eagle3's per-token encoder can be chunked safely. + const int32_t n_ubatch_dft = (int32_t) llama_n_ubatch(ctx_dft); + for (int32_t i = 0; i < n_tokens; i += n_ubatch_dft) { + const int32_t n_chunk = std::min(n_ubatch_dft, n_tokens - i); + + llama_batch enc_batch = { + /*.n_tokens =*/ n_chunk, + /*.token =*/ nullptr, + /*.embd =*/ features_buf.data() + (size_t) i * n_embd_enc, + /*.pos =*/ nullptr, + /*.n_seq_id =*/ nullptr, + /*.seq_id =*/ nullptr, + /*.logits =*/ nullptr, + }; + const int32_t rc = llama_encode(ctx_dft, enc_batch); + if (rc != 0) { + LOG_ERR("%s: llama_encode(ctx_dft) failed rc=%d (n_tokens=%d, offset=%d)\n", + __func__, rc, (int) n_chunk, (int) i); + return false; + } + + // g_embd has shape [n_chunk, n_embd_dec] in ctx_dft's pre-norm embeddings buffer. + const float * g_embd_chunk = llama_get_embeddings_nextn(ctx_dft); + GGML_ASSERT(g_embd_chunk && "EAGLE3 encoder produced no output."); + std::memcpy(g_embd_buf.data() + (size_t) i * n_embd_dec, + g_embd_chunk, + (size_t) n_chunk * n_embd_dec * sizeof(float)); + } + + const float * g_embd = g_embd_buf.data(); + + const size_t row_bytes = (size_t) n_embd_dec * sizeof(float); + + // EAGLE3 decoder input convention: at memory pos P the input pair is + // (token[P+1], g_embd[P]). This shifts the token index "left by one" relative to g_embd. + // + // Per seq, in order: + // (a) cross-ubatch bridge — when applicable, write the previously-deferred + // pos using this ubatch's first token + pending_g_last. + // (b) main write loop — for k in [beg, end-1], write (token[k+1], g_embd[k]) + // at pos[k]. The last training pos (k=end) is left unwritten = new + // deferred boundary, completed by the next process() or draft() call. + // (c) refresh deferred state — stash this ubatch's full g_embd into verify_g, + // update pending_g_last / pending_pos_last to the last row. + common_batch_clear(batch); + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + const int32_t beg = i_batch_beg[seq_id]; + const int32_t end = i_batch_end[seq_id]; + if (beg < 0 || end < 0) { + continue; + } + + // cross-ubatch bridge — complete the prior ubatch's deferred boundary. + // Fires iff all three preconditions hold: + // 1) pending_pos_last >= 0 + // 2) pending_pos_last + 1 == pos[beg] + // 3) pending_pos_last > dft_pos_max // TODO: is this check needed? + const llama_pos pending_pos = pending_pos_last[seq_id]; + if (pending_pos >= 0 && pending_pos + 1 == batch_in.pos[beg]) { + const llama_pos dft_pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_dft), seq_id); + if (pending_pos > dft_pos_max) { + common_batch_add(batch, batch_in.token[beg], pending_pos, { seq_id }, /*logits=*/ false); + std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, + pending_g_last[seq_id].data(), row_bytes); + } + } + + for (int32_t k = beg; k < end; ++k) { + common_batch_add(batch, batch_in.token[k + 1], batch_in.pos[k], { seq_id }, /*logits=*/ false); + std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, + g_embd + (size_t) k * n_embd_dec, row_bytes); + } + + // refresh deferred state + const int32_t n_rows = end - beg + 1; + verify_pos_first[seq_id] = batch_in.pos[beg]; + pending_pos_last[seq_id] = batch_in.pos[end]; + verify_g_rows[seq_id] = n_rows; + verify_g[seq_id].resize((size_t) n_rows * n_embd_dec, 0.0f); + std::memcpy(verify_g[seq_id].data(), g_embd + (size_t) beg * n_embd_dec, row_bytes * n_rows); + std::memcpy(pending_g_last[seq_id].data(), g_embd + (size_t) end * n_embd_dec, row_bytes); + } + + if (batch.n_tokens > 0) { + const int32_t rc = llama_decode(ctx_dft, batch); + if (rc != 0) { + LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (n_tokens=%d, ubatch_pos[0]=%d)\n", + __func__, rc, (int) batch.n_tokens, (int) batch_in.pos[0]); + return false; + } + } + return true; } - void draft(common_speculative_draft_params_vec & /*dparams*/) override { - // TODO: implement + void draft(common_speculative_draft_params_vec & dparams) override { + auto & ctx_dft = params.ctx_dft; + + common_batch_clear(batch); + + // keep track of which sequences are still drafting + int n_drafting = 0; + std::vector drafting(n_seq); + + const size_t row_bytes = (size_t) n_embd_dec * sizeof(float); + + // Complete the deferred boundary pair (dp.id_last, pending_g_last) at memory + // pos pending_pos_last. dp.id_last is target's freshest sample (= corrected + // token after verify, or first generated token after prefill), matching the + // EAGLE3 input convention (token[P+1], g_embd[P]) at pos P. + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + auto & dp = dparams[seq_id]; + + if (!dp.drafting) { + continue; + } + if (pending_pos_last[seq_id] < 0) { + continue; + } + + n_drafting++; + drafting[seq_id] = true; + common_sampler_reset(smpls[seq_id].get()); + + llama_memory_seq_rm(llama_get_memory(ctx_dft), seq_id, pending_pos_last[seq_id], -1); + + common_batch_add(batch, dp.id_last, pending_pos_last[seq_id], { seq_id }, true); + std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, + pending_g_last[seq_id].data(), + row_bytes); + } + + if (batch.n_tokens == 0) { + return; + } + + int ret = llama_decode(ctx_dft, batch); + if (ret != 0) { + LOG_WRN("%s: llama_decode returned %d\n", __func__, ret); + return; + } + + int i = 0; + + while (n_drafting > 0) { + int i_batch = 0; + + common_batch_clear(batch); + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (!drafting[seq_id]) { + continue; + } + + auto * smpl = smpls[seq_id].get(); + + common_sampler_sample(smpl, ctx_dft, i_batch, true); + // pre-norm hidden state of this position becomes g_embd for the next step + const float * prenorm = llama_get_embeddings_nextn_ith(ctx_dft, i_batch); + ++i_batch; + + const auto * cur_p = common_sampler_get_candidates(smpl, true); + + for (int k = 0; k < std::min(3, (int) cur_p->size); ++k) { + LOG_DBG(" - seq_id %d, draft candidate %3d, pos %3d: %6d (%8.3f) '%s'\n", + seq_id, k, i, cur_p->data[k].id, cur_p->data[k].p, + common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str()); + } + + const llama_token id = cur_p->data[0].id; + + // only collect very high-confidence draft tokens + // (configurable via --spec-draft-p-min, set to 0.0 to disable early-stop) + if (cur_p->data[0].p < params.p_min) { + drafting[seq_id] = false; + n_drafting--; + + continue; + } + + common_sampler_accept(smpl, id, true); + + auto & dp = dparams.at(seq_id); + auto & result = *dp.result; + + result.push_back(id); + + if (params.n_max <= (int) result.size()) { + drafting[seq_id] = false; + n_drafting--; + continue; + } + + common_batch_add(batch, id, pending_pos_last[seq_id] + (i + 1), { seq_id }, true); + std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd_dec, prenorm, row_bytes); + } + + if (batch.n_tokens == 0) { + break; + } + + ret = llama_decode(ctx_dft, batch); + if (ret != 0) { + LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret); + break; + } + + ++i; + } + + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + auto & dp = dparams[seq_id]; + if (!dp.drafting) { + continue; + } + + if (dp.result->size() < (size_t) params.n_min) { + dp.result->clear(); + } + } } - void accept(llama_seq_id /*seq_id*/, uint16_t /*n_accepted*/, bool /*is_other*/) override { - // noop + void accept(llama_seq_id seq_id, uint16_t n_accepted, bool /*is_other*/) override { + if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { + return; + } + + const int32_t n_rows = verify_g_rows[seq_id]; + if (n_rows <= 0) { + return; + } + + const int32_t i_g = std::min(n_accepted, n_rows - 1); + pending_pos_last[seq_id] = verify_pos_first[seq_id] + i_g; + std::memcpy(pending_g_last[seq_id].data(), + verify_g[seq_id].data() + (size_t) i_g * n_embd_dec, + (size_t) n_embd_dec * sizeof(float)); + } + + // we only need to stash the deferred boundary's g_embd row for recurrent/hybrid targets: + // their single-position checkpoints drop it on restore + bool need_boundary_stash() const { + const llama_model * model_tgt = llama_get_model(params.ctx_tgt); + return llama_model_is_recurrent(model_tgt) || llama_model_is_hybrid(model_tgt); + } + + bool get_state(llama_seq_id seq_id, std::vector & data) const override { + if (!need_boundary_stash()) { + return false; + } + if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq || pending_pos_last[seq_id] < 0) { + return false; + } + + const llama_pos pos = pending_pos_last[seq_id]; + const std::vector & g = pending_g_last[seq_id]; + + data.resize(sizeof(llama_pos) + g.size() * sizeof(float)); + std::memcpy(data.data(), &pos, sizeof(llama_pos)); + std::memcpy(data.data() + sizeof(llama_pos), g.data(), g.size() * sizeof(float)); + return true; + } + + void set_state(llama_seq_id seq_id, const std::vector & data) override { + if (!need_boundary_stash()) { + return; + } + if (seq_id < 0 || seq_id >= (llama_seq_id) n_seq) { + return; + } + if (data.size() != sizeof(llama_pos) + (size_t) n_embd_dec * sizeof(float)) { + return; + } + + llama_pos pos = -1; + std::memcpy(&pos, data.data(), sizeof(llama_pos)); + + pending_pos_last[seq_id] = pos; + pending_g_last[seq_id].resize(n_embd_dec); + std::memcpy(pending_g_last[seq_id].data(), data.data() + sizeof(llama_pos), (size_t) n_embd_dec * sizeof(float)); } bool need_embd() const override { @@ -419,7 +905,13 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { int32_t n_embd = 0; - bool is_mem_shared = false; + // One MTP draft driver, three modes (set once in the ctor): + // is_mem_shared (gemma4): shares the target KV, runs all heads in one graph. + // chain_heads (step35): n_mtp_layers trained heads, one per draft step. + // neither (qwen35 / qwen35moe): a single trained MTP head. + int32_t n_mtp_layers = 1; + bool is_mem_shared = false; // gemma4 + bool chain_heads = false; // derived in the ctor: n_mtp_layers > 1 && !is_mem_shared // Per-sequence cross-batch carryover: pair (h_p, x_{p+1}) at MTP pos p+1. // The last h-row of one process() call needs the first token of the NEXT @@ -434,10 +926,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { std::vector> verify_h; std::vector verify_h_rows; - // Per-seq draft length from the last draft() call, used in accept() to - // roll back ctx_dft's recurrent state past the AR draft's redundant - // pre-advancement before process() mirrored the verify batch. - std::vector last_n_drafted; + std::vector i_last; + std::vector> chain_h; common_speculative_impl_draft_mtp(const common_params_speculative & params, uint32_t n_seq) : common_speculative_impl(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, n_seq) @@ -450,6 +940,7 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { n_embd = llama_model_n_embd_out(llama_get_model(ctx_dft)); GGML_ASSERT(n_embd == llama_model_n_embd(llama_get_model(ctx_tgt)) && "MTP input row width must match the target h_nextn width"); + n_mtp_layers = std::max(1, (int) llama_model_n_layer_nextn(llama_get_model(ctx_dft))); LOG_INF("%s: adding speculative implementation 'draft-mtp'\n", __func__); LOG_INF("%s: - n_max=%d, n_min=%d, p_min=%.2f, n_embd=%d, backend_sampling=%d\n", __func__, this->params.n_max, this->params.n_min, this->params.p_min, n_embd, (int) this->params.backend_sampling); @@ -496,16 +987,25 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { llama_set_embeddings_nextn(ctx_dft, true, /*masked*/ true); is_mem_shared = llama_get_ctx_other(ctx_dft) == ctx_tgt; + chain_heads = n_mtp_layers > 1 && !is_mem_shared; + + if (chain_heads) { + this->params.n_max = std::min(this->params.n_max, n_mtp_layers); + + chain_h.assign(n_seq, {}); + for (auto & c : chain_h) { + c.reserve((size_t) (this->params.n_max + 1) * n_embd); + } + } pending_h.assign(n_seq, std::vector(n_embd, 0.0f)); + i_last.assign(n_seq, -1); i_batch_beg.assign(n_seq, -1); i_batch_end.assign(n_seq, -1); verify_h.assign(n_seq, {}); verify_h_rows.assign(n_seq, 0); - - last_n_drafted.assign(n_seq, 0); } ~common_speculative_impl_draft_mtp() override { @@ -611,9 +1111,34 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { set_h(i_batch_beg[seq_id], pending_h[seq_id].data()); } - const int32_t rc = llama_decode(ctx_dft, batch); - if (rc != 0) { - LOG_ERR("%s: llama_decode(ctx_dft) failed rc=%d (pos=%d)\n", __func__, (int) rc, (int) batch_in.pos[0]); + auto * mem_dft = llama_get_memory(ctx_dft); + + bool ok = true; + for (int head = 0; head < n_mtp_layers; ++head) { + if (chain_heads) { + // ref: https://github.com/ggml-org/llama.cpp/pull/24340/changes#r3413498544 + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (i_batch_beg[seq_id] < 0) { + continue; + } + llama_memory_seq_rm(mem_dft, seq_id, batch_in.pos[i_batch_beg[seq_id]], -1); + } + llama_set_nextn_layer_offset(ctx_dft, head); + } + + const int32_t rc = llama_decode(ctx_dft, batch); + if (rc != 0) { + LOG_ERR("%s: llama_decode(ctx_dft) head=%d failed rc=%d (pos=%d)\n", + __func__, head, (int) rc, (int) batch_in.pos[0]); + ok = false; + break; + } + } + + if (chain_heads) { + llama_set_nextn_layer_offset(ctx_dft, 0); // restore default for non-draft decodes + } + if (!ok) { return false; } } @@ -648,7 +1173,6 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { int n_drafting = 0; std::vector drafting(n_seq); - const float * h_row = nullptr; const size_t row_bytes = (size_t) n_embd * sizeof(float); for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { @@ -663,22 +1187,43 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { common_sampler_reset(smpls[seq_id].get()); common_batch_add(batch, dp.id_last, dp.n_past, { seq_id }, true); + std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, pending_h[seq_id].data(), row_bytes); - h_row = pending_h[seq_id].data(); - std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes); - } + i_last[seq_id] = batch.n_tokens - 1; - int ret = llama_decode(ctx_dft, batch); - if (ret != 0) { - LOG_WRN("%s: llama_decode returned %d\n", __func__, ret); - return; + if (chain_heads) { + chain_h[seq_id].assign(pending_h[seq_id].begin(), pending_h[seq_id].end()); + } } int i = 0; while (n_drafting > 0) { - int i_batch = 0; + // each step decodes under a different head, i.e. a different decoder layer, and + // KV is per layer. process() filled this layer's KV only for positions < n_past + // (prompt + accepted prefix) — nothing in the draft region yet. so reset the + // draft region (the seq_rm lower bound is n_past, leaving the prompt KV intact) + // and select head i so it rebuilds its own layer's KV there; decoding just the + // latest token would leave its attention reading cells only another head wrote. + if (chain_heads) { + auto * mem_dft = llama_get_memory(ctx_dft); + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { + if (drafting[seq_id]) { + llama_memory_seq_rm(mem_dft, seq_id, dparams[seq_id].n_past, -1); + } + } + llama_set_nextn_layer_offset(ctx_dft, i); + } + int ret = llama_decode(ctx_dft, batch); + if (ret != 0) { + LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret); + break; + } + + // rebuild the batch for the next step: the growing-KV paths re-add only the + // new token (the KV already holds the prefix), while chained heads re-add the + // whole prefix at the next head. dropped sequences are simply not re-added. common_batch_clear(batch); for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { @@ -688,9 +1233,8 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { auto * smpl = smpls[seq_id].get(); - common_sampler_sample(smpl, ctx_dft, i_batch, true); - h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_batch); - ++i_batch; + common_sampler_sample(smpl, ctx_dft, i_last[seq_id], true); + const float * h_row = llama_get_embeddings_nextn_ith(ctx_dft, i_last[seq_id]); const auto * cur_p = common_sampler_get_candidates(smpl, true); @@ -724,30 +1268,41 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { continue; } - if (is_mem_shared) { + if (chain_heads) { + // ref: https://github.com/ggml-org/llama.cpp/pull/24340#discussion_r3448031546 + chain_h[seq_id].insert(chain_h[seq_id].end(), h_row, h_row + n_embd); + + const int n_rows = (int) result.size() + 1; // id_last + tokens drafted so far + for (int t = 0; t < n_rows; ++t) { + const llama_token tok = (t == 0) ? dp.id_last : result[t - 1]; + common_batch_add(batch, tok, dp.n_past + t, { seq_id }, t == n_rows - 1); + std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, + chain_h[seq_id].data() + (size_t) t * n_embd, row_bytes); + } + } else if (is_mem_shared) { // note: with shared memory (e.g. Gemma4 assistants) we use the same position for all draft tokens // ref: https://github.com/huggingface/transformers/blob/effde20942e3f82a1b97449f60b3a48c5ff96145/docs/source/en/model_doc/gemma4_assistant.md?plain=1#L36-L37 common_batch_add(batch, id, dp.n_past, { seq_id }, true); + std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, h_row, row_bytes); } else { common_batch_add(batch, id, dp.n_past + i + 1, { seq_id }, true); + std::memcpy(batch.embd + (size_t) (batch.n_tokens - 1) * n_embd, h_row, row_bytes); } - std::memcpy(batch.embd + n_embd*(batch.n_tokens - 1), h_row, row_bytes); - } - if (batch.n_tokens == 0) { - break; + i_last[seq_id] = batch.n_tokens - 1; } - // evaluate the drafted tokens on the draft model - ret = llama_decode(ctx_dft, batch); - if (ret != 0) { - LOG_WRN("%s: llama_decode[%d] returned %d\n", __func__, i, ret); + if (batch.n_tokens == 0) { break; } ++i; } + if (chain_heads) { + llama_set_nextn_layer_offset(ctx_dft, 0); // restore default for non-draft decodes + } + for (llama_seq_id seq_id = 0; seq_id < (llama_seq_id) n_seq; ++seq_id) { auto & dp = dparams[seq_id]; if (!dp.drafting) { @@ -757,8 +1312,6 @@ struct common_speculative_impl_draft_mtp : public common_speculative_impl { if (dp.result->size() < (size_t) params.n_min) { dp.result->clear(); } - - last_n_drafted[seq_id] = (uint16_t) dp.result->size(); } } @@ -1370,8 +1923,10 @@ common_speculative * common_speculative_init(common_params_speculative & params, uint32_t enabled_configs = common_get_enabled_speculative_configs(params.types); bool has_draft_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE)); - bool has_draft_eagle3 = false; // TODO PR-18039: if params.speculative.eagle3 - bool has_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr; + bool has_draft_eagle3 = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3)) && params.draft.ctx_dft != nullptr; + bool has_draft_mtp = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_DRAFT_MTP)) && params.draft.ctx_dft != nullptr; + + bool has_ngram_cache = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_CACHE)); bool has_ngram_simple = (enabled_configs & (1u << COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE)); @@ -1407,7 +1962,7 @@ common_speculative * common_speculative_init(common_params_speculative & params, if (has_draft_eagle3) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, params)); } - if (has_mtp) { + if (has_draft_mtp) { configs.push_back(common_speculative_config(COMMON_SPECULATIVE_TYPE_DRAFT_MTP, params)); } } @@ -1651,6 +2206,15 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u { common_time_meas tm(impl->t_accept_us, !impl->gen_perf); + + if (impl->n_acc_tokens_per_pos.size() < n_accepted) { + impl->n_acc_tokens_per_pos.resize(n_accepted, 0); + } + + for (size_t i = 0; i < n_accepted; ++i) { + impl->n_acc_tokens_per_pos[i]++; + } + if (n_accepted > 0) { impl->n_acc_drafts++; impl->n_acc_tokens += n_accepted; @@ -1668,6 +2232,31 @@ void common_speculative_accept(common_speculative * spec, llama_seq_id seq_id, u } } +// TODO: support the case of more than one speculative implementations having a state +bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector & data) { + if (spec == nullptr) { + return false; + } + + for (auto & impl : spec->impls) { + if (impl->get_state(seq_id, data)) { + return true; + } + } + + return false; +} + +void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector & data) { + if (spec == nullptr) { + return; + } + + for (auto & impl : spec->impls) { + impl->set_state(seq_id, data); + } +} + void common_speculative_print_stats(const common_speculative * spec) { if (spec == nullptr) { return; @@ -1685,13 +2274,31 @@ void common_speculative_print_stats(const common_speculative * spec) { str_perf = ""; } - LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s\n", + std::string str_stats; + if (impl->n_call_accept > 0) { + const double mean = + 1.0 + (double) impl->n_acc_tokens / (double) impl->n_call_accept; + std::ostringstream tmp; + tmp << std::fixed << std::setprecision(3); + for (size_t i = 0; i < impl->n_acc_tokens_per_pos.size(); ++i) { + if (i > 0) { + tmp << ", "; + } + tmp << (double) impl->n_acc_tokens_per_pos[i] / (double) impl->n_call_accept; + } + std::ostringstream oss; + oss << std::fixed << std::setprecision(2) << mean; + str_stats = ", #mean acc len = " + oss.str() + ", #acc rate/pos = (" + tmp.str() + ")"; + } + + LOG_INF("statistics %16s: #calls(b,g,a) = %4zu %6zu %6zu, #gen drafts = %6zu, #acc drafts = %5zu, #gen tokens = %6zu, #acc tokens = %5zu%s%s\n", common_speculative_type_to_str(impl->type).c_str(), impl->n_call_begin, impl->n_call_draft, impl->n_call_accept, impl->n_gen_drafts, impl->n_acc_drafts, impl->n_gen_tokens, impl->n_acc_tokens, + str_stats.c_str(), str_perf.c_str()); } } diff --git a/common/speculative.h b/common/speculative.h index bf76ad709e26..c58fac3cc6d0 100644 --- a/common/speculative.h +++ b/common/speculative.h @@ -68,6 +68,10 @@ void common_speculative_draft(common_speculative * spec); // informs the speculative context that n_accepted tokens were accepted by the target model void common_speculative_accept(common_speculative * spec, llama_seq_id, uint16_t n_accepted); +// (optional) get/set internal state +bool common_speculative_get_state(common_speculative * spec, llama_seq_id seq_id, std::vector & data); +void common_speculative_set_state(common_speculative * spec, llama_seq_id seq_id, const std::vector & data); + // print statistics about the speculative decoding void common_speculative_print_stats(const common_speculative * spec); diff --git a/conversion/__init__.py b/conversion/__init__.py index 18162976f458..5aad203e53c3 100644 --- a/conversion/__init__.py +++ b/conversion/__init__.py @@ -40,11 +40,13 @@ "ChatGLMModel": "chatglm", "CodeShellForCausalLM": "codeshell", "CogVLMForCausalLM": "cogvlm", + "Cohere2MoeForCausalLM": "command_r", "Cohere2ForCausalLM": "command_r", "CohereForCausalLM": "command_r", "DbrxForCausalLM": "dbrx", "DeciLMForCausalLM": "deci", "DeepseekForCausalLM": "deepseek", + "DeepseekOCRForCausalLM": "deepseek", "DeepseekV2ForCausalLM": "deepseek", "DeepseekV3ForCausalLM": "deepseek", "DeepseekV32ForCausalLM": "deepseek", @@ -95,6 +97,7 @@ "GraniteMoeHybridForCausalLM": "granite", "GraniteMoeSharedForCausalLM": "granite", "GraniteSpeechForConditionalGeneration": "granite", + "GraniteSpeechPlusForConditionalGeneration": "granite", "Grok1ForCausalLM": "grok", "GrokForCausalLM": "grok", "GroveMoeForCausalLM": "grovemoe", @@ -122,6 +125,7 @@ "LLaDAModelLM": "llada", "LLaMAForCausalLM": "llama", "Lfm25AudioTokenizer": "lfm2", + "Lfm2BidirectionalModel": "lfm2", "Lfm2ForCausalLM": "lfm2", "Lfm2Model": "lfm2", "Lfm2MoeForCausalLM": "lfm2", @@ -130,6 +134,10 @@ "LlamaBidirectionalModel": "llama", "LlamaForCausalLM": "llama", "LlamaModel": "llama", + "Eagle3DraftModel": "llama", + "Eagle3Speculator": "llama", + "Eagle3LlamaForCausalLM": "llama", + "LlamaForCausalLMEagle3": "llama", "LlavaForConditionalGeneration": "llama", "LlavaStableLMEpochForCausalLM": "stablelm", "MPTForCausalLM": "mpt", @@ -227,6 +235,7 @@ "UMT5ForConditionalGeneration": "t5", "UMT5Model": "t5", "UltravoxModel": "ultravox", + "UnlimitedOCRForCausalLM": "deepseek", "VLlama3ForCausalLM": "llama", "VoxtralForConditionalGeneration": "llama", "WavTokenizerDec": "wavtokenizer", @@ -257,6 +266,7 @@ "GlmasrModel": "ultravox", "Granite4VisionForConditionalGeneration": "granite", "GraniteSpeechForConditionalGeneration": "granite", + "GraniteSpeechPlusForConditionalGeneration": "granite", "HunYuanVLForConditionalGeneration": "hunyuan", "Idefics3ForConditionalGeneration": "smolvlm", "InternVisionModel": "internvl", @@ -292,6 +302,7 @@ "StepVLForConditionalGeneration": "step3", "Step3p7ForConditionalGeneration": "step3", "UltravoxModel": "ultravox", + "UnlimitedOCRForCausalLM": "deepseek", "VoxtralForConditionalGeneration": "ultravox", "YoutuVLForConditionalGeneration": "youtuvl", } diff --git a/conversion/bailingmoe.py b/conversion/bailingmoe.py index 319ff6dabee2..2c6425cb6436 100644 --- a/conversion/bailingmoe.py +++ b/conversion/bailingmoe.py @@ -126,7 +126,7 @@ def set_gguf_parameters(self): if (rope_dim := hparams.get("head_dim")) is None: rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))) self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) self.gguf_writer.add_vocab_size(hparams["vocab_size"]) self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) diff --git a/conversion/base.py b/conversion/base.py index 408e209aa884..08fd3747c408 100644 --- a/conversion/base.py +++ b/conversion/base.py @@ -94,6 +94,7 @@ class ModelBase: metadata: gguf.Metadata dir_model_card: Path remote_hf_model_id: str | None + target_model_dir: Path | None # subclasses should define this! model_arch: gguf.MODEL_ARCH @@ -119,6 +120,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, disable_mistral_community_chat_template: bool = False, sentence_transformers_dense_modules: bool = False, + target_model_dir: Path | None = None, fuse_gate_up_exps: bool = False, fp8_as_q8: bool = False): if type(self) is ModelBase or \ @@ -139,6 +141,7 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, self.dry_run = dry_run self.remote_hf_model_id = remote_hf_model_id self.sentence_transformers_dense_modules = sentence_transformers_dense_modules + self.target_model_dir = target_model_dir self.fuse_gate_up_exps = fuse_gate_up_exps self._gate_exp_buffer: dict[int, Tensor] = {} self._up_exp_buffer: dict[int, Tensor] = {} @@ -1116,8 +1119,10 @@ def __init__(self, *args, **kwargs): rope_theta = self.find_hparam(["global_rope_theta", "rope_global_theta", "rope_theta_global", "rope_theta", "rotary_emb_base"], optional=True) local_rope_theta = self.find_hparam(["local_rope_theta", "rope_local_theta", "rope_theta_local", "swa_rope_theta", "rope_local_base_freq"], optional=True) + partial_rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"], optional=True) + original_max_position_embeddings = self.find_hparam(["original_max_position_embeddings"], optional=True) - # Ensure "rope_theta" and "rope_type" is mirrored in rope_parameters + # Ensure global params are mirrored in rope_parameters if "full_attention" not in self.rope_parameters and "sliding_attention" not in self.rope_parameters: if local_rope_theta is not None: self.rope_parameters["sliding_attention"] = {"rope_theta": local_rope_theta} @@ -1125,6 +1130,10 @@ def __init__(self, *args, **kwargs): self.rope_parameters["rope_theta"] = rope_theta if "rope_type" not in self.rope_parameters and (rope_type := self.rope_parameters.get("type")) is not None: self.rope_parameters["rope_type"] = rope_type + if "partial_rotary_factor" not in self.rope_parameters and partial_rotary_factor is not None: + self.rope_parameters["partial_rotary_factor"] = partial_rotary_factor + if "original_max_position_embeddings" not in self.rope_parameters and original_max_position_embeddings is not None: + self.rope_parameters["original_max_position_embeddings"] = original_max_position_embeddings @classmethod def __init_subclass__(cls): @@ -1192,7 +1201,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(n_embd) logger.info(f"gguf: embedding length = {n_embd}") - if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None: + if (n_ff := self.find_hparam(["prefix_dense_intermediate_size", "intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) logger.info(f"gguf: feed forward length = {n_ff}") @@ -1277,7 +1286,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_expert_group_used_count(n_group_used) logger.info(f"gguf: expert groups used count = {n_group_used}") - if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func"], optional=True)) is not None: + if (score_func := self.find_hparam(["score_function", "scoring_func", "score_func", "moe_router_activation", "moe_router_activation_func", "expert_selection_fn"], optional=True)) is not None: if score_func == "sigmoid": self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) elif score_func == "softmax": @@ -1492,6 +1501,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "d772b220ace2baec124bed8cfafce0ead7d6c38a4b65ef11261cf9d5d62246d1": # ref: https://huggingface.co/CohereLabs/tiny-aya-base res = "tiny_aya" + if chkhsh == "52df12b4c8d4176e7481aab4b6e8454d1fd0a210a04a574f6d4e067d10e23c3e": + # ref: https://huggingface.co/CohereLabs/North-Mini-Code-1.0 + res = "cohere2moe" if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": # ref: https://huggingface.co/Qwen/Qwen1.5-7B res = "qwen2" @@ -2481,6 +2493,7 @@ class LazyTorchTensor(gguf.LazyBase): torch.float16: np.float16, torch.float32: np.float32, torch.uint8: np.uint8, + torch.int64: np.int64, } # only used when byteswapping data. Only correct size is needed diff --git a/conversion/chatglm.py b/conversion/chatglm.py index 7e323b890049..801913075dbc 100644 --- a/conversion/chatglm.py +++ b/conversion/chatglm.py @@ -148,7 +148,7 @@ def set_gguf_parameters(self): rope_dim = self.hparams["attention_dim"] else: rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5))) self.gguf_writer.add_add_bos_token(False) rope_freq = 10000 if "rope_ratio" in self.hparams: diff --git a/conversion/command_r.py b/conversion/command_r.py index 603288d165c5..118565c66973 100644 --- a/conversion/command_r.py +++ b/conversion/command_r.py @@ -1,5 +1,6 @@ from __future__ import annotations +import re from typing import Iterable, TYPE_CHECKING import torch @@ -55,3 +56,122 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return yield from super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Cohere2MoeForCausalLM") +class Cohere2MoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.COHERE2MOE + _n_main_layers: int | None = None + _expert_tensor_re = re.compile( + r"model\.layers\.(\d+)\.mlp\.experts\.(\d+)\.(down_proj|gate_proj|up_proj)\.weight" + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if (n_nextn := int(self.hparams.get("num_nextn_predict_layers", 0) or 0)) > 0 and not self.no_mtp: + self.block_count += n_nextn + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + self._experts: list[dict[str, Tensor]] = [{} for _ in range(self.block_count)] + + def _set_vocab_gpt2(self) -> None: + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + hparams = self.hparams + expert_intermediate_size = hparams["intermediate_size"] + mlp_layer_types = hparams.get("mlp_layer_types") + n_dense_lead = hparams.get("first_k_dense_replace", 0) + if mlp_layer_types is not None: + n_dense_lead = next((i for i, t in enumerate(mlp_layer_types) if t != "dense"), len(mlp_layer_types)) + + super().set_gguf_parameters() + + self.gguf_writer.add_logit_scale(hparams["logit_scale"]) + self.gguf_writer.add_sliding_window(hparams["sliding_window"]) + self.gguf_writer.add_sliding_window_pattern([t == "sliding_attention" for t in hparams["layer_types"]]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size) + self.gguf_writer.add_leading_dense_block_count(n_dense_lead) + self.gguf_writer.add_expert_weights_norm(hparams.get("norm_topk_prob", False)) + if (num_shared_experts := hparams.get("num_shared_experts", 0)) > 0: + if hparams.get("shared_expert_combination_strategy", "average") != "average": + raise ValueError("Cohere2 MoE only supports average shared expert combination") + self.gguf_writer.add_expert_shared_count(num_shared_experts) + self.gguf_writer.add_expert_shared_feed_forward_length(expert_intermediate_size * num_shared_experts) + if (n_nextn := hparams.get("num_nextn_predict_layers", 0)) > 0 and not self.no_mtp: + self.gguf_writer.add_nextn_predict_layers(n_nextn) + self.gguf_writer.add_rope_dimension_count(hparams["head_dim"]) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + + def index_tensors(self, remote_hf_model_id: str | None = None): + hparams = {**self.hparams, **self.hparams.get("text_config", {})} + self._n_main_layers = hparams.get("num_hidden_layers") + type(self)._n_main_layers = self._n_main_layers + return super().index_tensors(remote_hf_model_id=remote_hf_model_id) + + @classmethod + def filter_tensors(cls, item): + if (titem := super().filter_tensors(item)) is None: + return None + name, gen = titem + + if cls._n_main_layers is not None: + is_mtp = (m := re.match(r"model\.layers\.(\d+)\.", name)) is not None and int(m.group(1)) >= cls._n_main_layers + if is_mtp and cls.no_mtp: + return None + if cls.mtp_only and not is_mtp and name not in ( + "model.embed_tokens.weight", "model.norm.weight", "lm_head.weight", + ): + return None + + return name, gen + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith(".bias"): + if torch.any(data_torch != 0): + raise ValueError(f"Bias tensor {name!r} is not zero.") + logger.debug(f"Skipping bias tensor {name!r}.") + return + + if (m := self._expert_tensor_re.fullmatch(name)) is not None: + n_experts = self.hparams["num_experts"] + layer_idx = int(m.group(1)) + assert bid is None or bid == layer_idx + + self._experts[layer_idx][name] = data_torch + + expected = { + f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight" + for xid in range(n_experts) + for w_name in ("down_proj", "gate_proj", "up_proj") + } + if expected.issubset(self._experts[layer_idx]): + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{layer_idx}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[layer_idx][ename]) + del self._experts[layer_idx][ename] + + data_torch = torch.stack(datas, dim=0) + merged_name = f"model.layers.{layer_idx}.mlp.experts.{w_name}.weight" + + yield from super().modify_tensors(data_torch, merged_name, layer_idx) + return + + yield from super().modify_tensors(data_torch, name, bid) + + def prepare_tensors(self): + super().prepare_tensors() + + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") diff --git a/conversion/deci.py b/conversion/deci.py index 46d8568c5a40..be446eefa637 100644 --- a/conversion/deci.py +++ b/conversion/deci.py @@ -161,7 +161,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: factor = rope_params.get("factor", 8.0) low_freq_factor = rope_params.get("low_freq_factor", 1.0) high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + old_context_len = rope_params.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor diff --git a/conversion/deepseek.py b/conversion/deepseek.py index 72520cc9f6a5..4c93fb66df64 100644 --- a/conversion/deepseek.py +++ b/conversion/deepseek.py @@ -14,7 +14,7 @@ from .qwen import QwenModel -@ModelBase.register("DeepseekOCRForCausalLM") +@ModelBase.register("DeepseekOCRForCausalLM", "UnlimitedOCRForCausalLM") class DeepseekOCRVisionModel(MmprojModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -205,6 +205,8 @@ def prepare_tensors(self): @ModelBase.register( "DeepseekV2ForCausalLM", "DeepseekV3ForCausalLM", + "DeepseekOCRForCausalLM", + "UnlimitedOCRForCausalLM", "KimiVLForConditionalGeneration", "KimiK25ForConditionalGeneration", "YoutuForCausalLM", @@ -224,7 +226,7 @@ def __init__(self, *args, **kwargs): self.origin_hf_arch = hparams.get('architectures', [None])[0] # special handling for Deepseek OCR - if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM"): + if self.origin_hf_arch in ("DeepseekOCRForCausalLM", "DeepseekOCR2ForCausalLM", "UnlimitedOCRForCausalLM"): self.model_arch = gguf.MODEL_ARCH.DEEPSEEK2OCR self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] self.gguf_writer.add_architecture() @@ -350,6 +352,12 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) + # Unlimited-OCR sliding window; written for metadata, the decoder ignores it (full MHA) + if is_ocr: + sliding_window = hparams.get("sliding_window_size") or hparams.get("sliding_window") + if sliding_window: + self.gguf_writer.add_sliding_window(sliding_window) + if (rope_mscale_all := self.rope_parameters.get("mscale_all_dim")) is not None: # [TAG_DEEPSEEK2_YARN_LOG_MUL_FIX] # note: for legacy reasons, this is not consistent with the other usages of self.gguf_writer.add_rope_scaling_yarn_log_mul diff --git a/conversion/exaone.py b/conversion/exaone.py index b21f02784292..bc4fb3f1b171 100644 --- a/conversion/exaone.py +++ b/conversion/exaone.py @@ -24,7 +24,7 @@ def set_gguf_parameters(self): assert (hparams["activation_function"] == "silu") - rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"], optional=True) + rotary_factor = self.rope_parameters.get("partial_rotary_factor") rotary_factor = rotary_factor if rotary_factor is not None else 1.0 self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) @@ -39,7 +39,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: factor = rope_params.get("factor", 8.0) low_freq_factor = rope_params.get("low_freq_factor", 1.0) high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + old_context_len = rope_params.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor @@ -104,7 +104,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: factor = rope_params.get("factor", 16.0) low_freq_factor = rope_params.get("low_freq_factor", 1.0) high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + old_context_len = rope_params.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor diff --git a/conversion/gemma.py b/conversion/gemma.py index 5b4ca5c583df..c552df732b0f 100644 --- a/conversion/gemma.py +++ b/conversion/gemma.py @@ -693,7 +693,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count_kv(value_arr) # handle n_rot differently for global vs swa layers - partial_rotary_factor_swa = self.hparams.get("partial_rotary_factor", 1.0) + partial_rotary_factor_swa = self.rope_parameters.get("partial_rotary_factor", 1.0) n_rot_full = int(head_dim_full) # "proportional" is used, see generate_extra_tensors n_rot_swa = int(head_dim_swa * partial_rotary_factor_swa) self.gguf_writer.add_rope_dimension_count(n_rot_full) diff --git a/conversion/glm.py b/conversion/glm.py index 641937720d67..895cefc22b89 100644 --- a/conversion/glm.py +++ b/conversion/glm.py @@ -124,7 +124,7 @@ def set_gguf_parameters(self): self.hparams["hidden_size"] // self.hparams["num_attention_heads"] ) self.gguf_writer.add_rope_dimension_count( - int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5)) + int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.5)) ) # MoE parameters - Use only routed expert count (shared experts handled separately) @@ -226,7 +226,7 @@ def set_gguf_parameters(self): super().set_gguf_parameters() rope_dim = self.hparams["qk_rope_head_dim"] - partial_rotary_factor = self.hparams.get("partial_rotary_factor", 1.0) + partial_rotary_factor = self.rope_parameters.get("partial_rotary_factor", 1.0) self.gguf_writer.add_rope_dimension_count(int(rope_dim * partial_rotary_factor)) # NextN/MTP prediction layers diff --git a/conversion/granite.py b/conversion/granite.py index 53441fe57016..8367ed225da6 100644 --- a/conversion/granite.py +++ b/conversion/granite.py @@ -348,6 +348,34 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("GraniteSpeechPlusForConditionalGeneration") +class GraniteSpeechPlusMmprojModel(GraniteSpeechMmprojModel): + """Conversion for GraniteSpeechPlus - extends GraniteSpeech with feature layer concatenation""" + has_vision_encoder = False + has_audio_encoder = True + + def set_gguf_parameters(self): + assert self.hparams_audio is not None + super().set_gguf_parameters() + + # Add feature_layer if present in encoder config + if feature_layers := self.hparams_audio.get("cat_hidden_layers"): + self.gguf_writer.add_audio_feature_layers(feature_layers) + logger.info(f"gguf: audio feature_layers = {feature_layers}") + + # Validate projector dimension matches concatenated encoder output + hidden_dim = self.hparams_audio["hidden_dim"] + expected_dim = hidden_dim * (len(feature_layers) + 1) + projector_dim = self.global_config["projector_config"]["encoder_hidden_size"] + + if projector_dim != expected_dim: + raise ValueError( + f"Projector encoder_hidden_size ({projector_dim}) does not match " + f"expected concatenated dimension ({expected_dim}). " + f"Expected: hidden_dim ({hidden_dim}) * (len(feature_layers) + 1) = {expected_dim}" + ) + + @ModelBase.register("Granite4VisionForConditionalGeneration") class Granite4VisionMmprojModel(MmprojModel): has_vision_encoder = True diff --git a/conversion/lfm2.py b/conversion/lfm2.py index f28fccf10f2c..70ce45658be5 100644 --- a/conversion/lfm2.py +++ b/conversion/lfm2.py @@ -64,11 +64,17 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Lfm2Model") +@ModelBase.register("Lfm2Model", "Lfm2BidirectionalModel") class LFM2ColBertModel(LFM2Model): model_arch = gguf.MODEL_ARCH.LFM2 dense_tensor_name = "dense_2" + def set_gguf_parameters(self): + super().set_gguf_parameters() + if self.hf_arch == "Lfm2BidirectionalModel": + self.gguf_writer.add_causal_attention(False) + self._try_set_pooling_type() + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if not name.startswith(self.dense_tensor_name): name = "model." + name @@ -76,10 +82,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - # dense tensor is stored in a separate safetensors file + # optional dense tensor is stored in a separate safetensors file from safetensors.torch import load_file tensors_file = self.dir_model / "1_Dense" / "model.safetensors" - assert tensors_file.is_file() + if not tensors_file.is_file(): + return tensor = load_file(tensors_file)["linear.weight"] self.gguf_writer.add_embedding_length_out(tensor.shape[0]) yield f"{self.dense_tensor_name}.weight", tensor.clone() diff --git a/conversion/llama.py b/conversion/llama.py index fd6167bfd91f..b43cc994aa3a 100644 --- a/conversion/llama.py +++ b/conversion/llama.py @@ -5,12 +5,13 @@ from typing import Callable, Iterable, TYPE_CHECKING +import numpy as np import torch if TYPE_CHECKING: from torch import Tensor -from .base import ModelBase, TextModel, gguf +from .base import ModelBase, TextModel, gguf, logger @ModelBase.register( @@ -21,6 +22,10 @@ "VLlama3ForCausalLM", "LlavaForConditionalGeneration", "VoxtralForConditionalGeneration", + "LlamaForCausalLMEagle3", + "Eagle3LlamaForCausalLM", + "Eagle3Speculator", + "Eagle3DraftModel", "IQuestCoderForCausalLM", "LlamaModel") class LlamaModel(TextModel): @@ -39,7 +44,61 @@ def __init__(self, *args, **kwargs): hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False) self.origin_hf_arch = hparams.get('architectures', [None])[0] + # Detect eagle3 draft checkpoint by hparams (some models don't use a distinct HF arch name) + if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1: + self.is_eagle3 = True + self.model_arch = gguf.MODEL_ARCH.EAGLE3 + logger.info("Detected EAGLE-3 draft model, switching to EAGLE3 architecture") + # Re-initialize tensor_map with eagle3 architecture + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + # Update gguf_writer architecture + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[self.model_arch] + self.gguf_writer.add_architecture() + if self.target_model_dir is None: + raise ValueError( + "EAGLE-3 model requires --target-model-dir to be specified. " + "Please provide the path to the target model directory to read config.json" + ) + # Read both eagle3 raw config and target model config + with open(self.dir_model / "config.json", 'r', encoding='utf-8') as f: + eagle3_raw_config = json.load(f) + with open(self.target_model_dir / "config.json", 'r', encoding='utf-8') as f: + target_config = json.load(f) + + if "text_config" in target_config: + target_config = {**target_config, **target_config["text_config"]} + self.target_vocab_size = target_config["vocab_size"] + + # target_layers: derived from target model layer count (low/mid/high) + target_num_layers = target_config["num_hidden_layers"] + target_layers = [2, target_num_layers // 2, target_num_layers - 3] + logger.info(f"EAGLE-3: target_layers = {target_layers} (target model has {target_num_layers} layers)") + self.gguf_writer.add_array(f"{self.gguf_writer.arch}.target_layers", target_layers) + + # target_hidden_size: prefer eagle3 config, fallback to target config + if eagle3_raw_config.get("target_hidden_size") is not None: + target_hidden_size = eagle3_raw_config["target_hidden_size"] + src = "EAGLE-3 config" + else: + target_hidden_size = target_config["hidden_size"] + src = "target model config" + logger.info(f"EAGLE-3: target_hidden_size = {target_hidden_size} (from {src})") + self.gguf_writer.add_uint32(f"{self.gguf_writer.arch}.target_hidden_size", target_hidden_size) + + # norm_before_residual (RedHat-style eagle3 specific) + norm_before_residual = eagle3_raw_config.get("norm_before_residual", False) + logger.info(f"EAGLE-3: norm_before_residual = {norm_before_residual}") + self.gguf_writer.add_bool(f"{self.gguf_writer.arch}.norm_before_residual", norm_before_residual) + def set_vocab(self): + # eagle3: use tokenizer from target model if provided + original_dir_model = None + if getattr(self, 'is_eagle3', False): + assert self.target_model_dir is not None + logger.info(f"EAGLE-3: Using tokenizer from target model: {self.target_model_dir}") + original_dir_model = self.dir_model + self.dir_model = self.target_model_dir + if self.origin_hf_arch == "GlmasrModel": return self._set_vocab_glmedge() @@ -85,6 +144,10 @@ def set_vocab(self): if self.hparams.get("vocab_size", 32000) == 49152: self.gguf_writer.add_add_bos_token(False) + # eagle3: Restore original dir_model + if original_dir_model is not None: + self.dir_model = original_dir_model + def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams @@ -129,7 +192,49 @@ def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Ca return super().filter_tensors((name, gen)) + def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]: + tensors = super().index_tensors(remote_hf_model_id) + + # Handle Eagle3Speculator nested config + if "transformer_layer_config" in self.hparams: + self.hparams = {**self.hparams, **self.hparams["transformer_layer_config"]} + + # eagle3 detection + if "draft_vocab_size" in self.hparams and self.hparams["num_hidden_layers"] == 1: + logger.info("EAGLE-3: renaming midlayer.* / layers.0.* to model.layers.0.*") + new_tensors = {} + for name, gen in tensors.items(): + if name.startswith("midlayer."): + new_name = "model.layers.0." + name[len("midlayer."):] + new_tensors[new_name] = gen + elif name.startswith("layers.0."): # Eagle3Speculator format + new_name = "model." + name + new_tensors[new_name] = gen + else: + new_tensors[name] = gen + return new_tensors + + return tensors + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # eagle3: special tensors that bypass standard llama mapping + if getattr(self, 'is_eagle3', False): + if name == "fc.weight": + yield (name, data_torch) + return + if name == "d2t": + # store for manual int64 handling in prepare_tensors (avoid F32 conversion) + if not hasattr(self, '_eagle3_int_tensors'): + self._eagle3_int_tensors = {} + self._eagle3_int_tensors[name] = data_torch + return + if name == "t2d": + # not used at runtime, skip + return + if name.endswith(".hidden_norm.weight"): + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM_2, bid), data_torch) + return + n_head = self.find_hparam(["n_heads", "num_attention_heads"]) n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"]) @@ -185,7 +290,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: factor = rope_params.get("factor", 8.0) low_freq_factor = rope_params.get("low_freq_factor", 1.0) high_freq_factor = rope_params.get("high_freq_factor", 4.0) - old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + old_context_len = rope_params.get("original_max_position_embeddings", 8192) low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor @@ -205,8 +310,33 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) def prepare_tensors(self): + # eagle3: collect d2t original dtype before parent converts tensors to F32 + eagle3_original_dtypes = {} + if getattr(self, 'is_eagle3', False): + for name, data_torch in self.get_tensors(): + if name == "d2t": + eagle3_original_dtypes[name] = data_torch.dtype + super().prepare_tensors() + # eagle3: write d2t as absolute target token ids + if getattr(self, 'is_eagle3', False) and hasattr(self, '_eagle3_int_tensors'): + for name, data_torch in self._eagle3_int_tensors.items(): + old_dtype = eagle3_original_dtypes.get(name, data_torch.dtype) + data = data_torch.to(torch.int64).cpu().numpy() + if name == "d2t": + data = data.reshape(-1) + data = data + np.arange(data.size, dtype=np.int64) + if np.any((data < 0) | (data >= self.target_vocab_size)): + raise ValueError(f"EAGLE-3 d2t target ids out of range for target vocab size {self.target_vocab_size}") + if np.unique(data).size != data.size: + raise ValueError("EAGLE-3 d2t contains duplicate target ids") + data_qtype = gguf.GGMLQuantizationType.I64 + + shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}" + logger.info(f"{name + ',':<30} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") + self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype) + if self._experts is not None: # flatten `list[dict[str, Tensor]]` into `list[str]` experts = [k for d in self._experts for k in d.keys()] diff --git a/conversion/mamba.py b/conversion/mamba.py index be0e36a29bad..43d559ffb0ae 100644 --- a/conversion/mamba.py +++ b/conversion/mamba.py @@ -114,7 +114,8 @@ def __init__(self, dir_model: Path, *args, **kwargs): hparams["text_config"] = hparams["llm_config"] super().__init__(dir_model, *args, hparams=hparams, **kwargs) self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"]) - self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model + self.expand = self.find_hparam(["mamba_expand", "expand"], optional=True) or 2 + self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or self.expand * self.d_model self.n_group = self.find_hparam(["n_groups"], optional=True) or 1 def set_vocab(self): @@ -144,11 +145,9 @@ def set_gguf_parameters(self): rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 - # Fail early for models which don't have a block expansion factor of 2 - # TODO: does this really matter? # skip the assertion for FalconH1 Model if self.model_arch != gguf.MODEL_ARCH.FALCON_H1: - assert self.d_inner == 2 * self.d_model + assert self.d_inner == self.expand * self.d_model assert self.d_inner % head_dim == 0 self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default diff --git a/conversion/mimo.py b/conversion/mimo.py index d4067aab4b61..11ec2867940a 100644 --- a/conversion/mimo.py +++ b/conversion/mimo.py @@ -154,7 +154,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_expert_count(self.hparams["n_routed_experts"]) self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) - rope_dim = int(self.hparams["head_dim"] * self.hparams["partial_rotary_factor"]) + rope_dim = int(self.hparams["head_dim"] * self.rope_parameters["partial_rotary_factor"]) self.gguf_writer.add_rope_dimension_count(rope_dim) self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("layernorm_epsilon", 1e-5)) diff --git a/conversion/minicpm.py b/conversion/minicpm.py index e9a4c4a74dc2..e31b26a00808 100644 --- a/conversion/minicpm.py +++ b/conversion/minicpm.py @@ -32,11 +32,9 @@ def set_gguf_parameters(self): def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: rope_dims = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is not None: - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - + long_factors = self.rope_parameters.get('long_factor') + short_factors = self.rope_parameters.get('short_factor') + if long_factors or short_factors: if long_factors is None or short_factors is None: raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') @@ -85,13 +83,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"]) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is not None: + long_factors = self.rope_parameters.get('long_factor') + short_factors = self.rope_parameters.get('short_factor') + if long_factors or short_factors: rope_dims = self.hparams["qk_rope_head_dim"] - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - if long_factors is None or short_factors is None: raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') diff --git a/conversion/nemotron.py b/conversion/nemotron.py index dfeeb9785822..e44688a78807 100644 --- a/conversion/nemotron.py +++ b/conversion/nemotron.py @@ -125,17 +125,18 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(f_norm_eps) # * Partial RoPE - rot_pct = self.find_hparam(["partial_rotary_factor", "rope_pct", "rope_percent"]) + rot_pct = self.rope_parameters["partial_rotary_factor"] n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) # * RopeScaling for Nemotron - if "rope_scaling" not in self.hparams or self.hparams["rope_scaling"] is None: + factor = self.hparams.get("factor") or self.rope_parameters.get("factor") + if factor is None: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) else: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) - self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"]) + self.gguf_writer.add_rope_scaling_factor(factor) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side diff --git a/conversion/phi.py b/conversion/phi.py index 5e0d72847aa8..df4bfe809af7 100644 --- a/conversion/phi.py +++ b/conversion/phi.py @@ -18,7 +18,7 @@ class Phi2Model(TextModel): model_arch = gguf.MODEL_ARCH.PHI2 def set_gguf_parameters(self): - rot_pct = self.find_hparam(["partial_rotary_factor"]) + rot_pct = self.rope_parameters["partial_rotary_factor"] n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_head = self.find_hparam(["num_attention_heads", "n_head"]) @@ -149,8 +149,8 @@ def set_gguf_parameters(self): n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) rms_eps = self.find_hparam(["rms_norm_eps"]) max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) - orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rot_pct = self.hparams.get("partial_rotary_factor", 1.0) + orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"] + rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0) rope_dims = int(rot_pct * n_embd) // n_head self.gguf_writer.add_context_length(max_pos_embds) @@ -174,18 +174,19 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_head = self.find_hparam(["num_attention_heads", "n_head"]) max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"]) - orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"]) - rot_pct = self.hparams.get("partial_rotary_factor", 1.0) + orig_max_pos_embds = self.rope_parameters["original_max_position_embeddings"] + rot_pct = self.rope_parameters.get("partial_rotary_factor", 1.0) rope_dims = int(rot_pct * n_embd) // n_head # write rope scaling for long context (128k) model - rope_scaling = self.find_hparam(['rope_scaling'], True) - if rope_scaling is None: + long_factors = self.rope_parameters.get('long_factor') + short_factors = self.rope_parameters.get('short_factor') + if not long_factors: return scale = max_pos_embds / orig_max_pos_embds - rope_scaling_type = rope_scaling.get('rope_type', rope_scaling.get('type', '')).lower() + rope_scaling_type = self.rope_parameters.get('rope_type', '').lower() if len(rope_scaling_type) == 0: raise KeyError('Missing the required key rope_scaling.type') @@ -198,9 +199,6 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: self.gguf_writer.add_rope_scaling_attn_factors(attn_factor) - long_factors = rope_scaling.get('long_factor', None) - short_factors = rope_scaling.get('short_factor', None) - if long_factors is None or short_factors is None: raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor') diff --git a/conversion/qwen.py b/conversion/qwen.py index 7eb135c832d4..6b85eb9aaf88 100644 --- a/conversion/qwen.py +++ b/conversion/qwen.py @@ -280,7 +280,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_full_attention_interval(self.hparams.get("full_attention_interval", 4)) if (rope_dim := self.hparams.get("head_dim")) is None: rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] - self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.25))) + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.rope_parameters.get("partial_rotary_factor", 0.25))) @classmethod def filter_tensors(cls, item: tuple[str, Callable[[], Tensor]]) -> tuple[str, Callable[[], Tensor]] | None: diff --git a/conversion/stablelm.py b/conversion/stablelm.py index ba5e9aa6ca9c..6e16378a031f 100644 --- a/conversion/stablelm.py +++ b/conversion/stablelm.py @@ -28,7 +28,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(hparams["hidden_size"]) self.gguf_writer.add_block_count(self.block_count) self.gguf_writer.add_feed_forward_length(hparams["intermediate_size"]) - rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) + rotary_factor = self.rope_parameters["partial_rotary_factor"] self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) diff --git a/conversion/step3.py b/conversion/step3.py index 8c45b61c954a..49bb5244a62b 100644 --- a/conversion/step3.py +++ b/conversion/step3.py @@ -314,7 +314,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: factor = float(rope_params.get("factor", 8.0)) low_freq_factor = float(rope_params.get("low_freq_factor", 1.0)) high_freq_factor = float(rope_params.get("high_freq_factor", 4.0)) - old_context_len = int(rope_params.get("original_max_position_embeddings", self.hparams.get("original_max_position_embeddings", 8192))) + old_context_len = int(rope_params.get("original_max_position_embeddings", 8192)) low_freq_wavelen = old_context_len / low_freq_factor high_freq_wavelen = old_context_len / high_freq_factor diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a6192c039a0a..3b23d5ebc0d3 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -153,6 +153,15 @@ def parse_args() -> argparse.Namespace: help="Store tensors dequantized from FP8 as Q8_0 instead of BF16/F16.", ) + parser.add_argument( + "--target-model-dir", type=str, default=None, + help=( + "path to the target model directory; required when converting a standalone draft model " + "(e.g. EAGLE3 / DFlash) that needs target-model metadata such as tokenizer, hidden size, and " + "layer count to populate its GGUF." + ), + ) + args = parser.parse_args() if not args.print_supported_models and args.model is None: parser.error("the following arguments are required: model") @@ -269,6 +278,7 @@ def main() -> None: small_first_shard=args.no_tensor_first_split, remote_hf_model_id=hf_repo_id, disable_mistral_community_chat_template=disable_mistral_community_chat_template, sentence_transformers_dense_modules=args.sentence_transformers_dense_modules, + target_model_dir=Path(args.target_model_dir) if args.target_model_dir else None, fuse_gate_up_exps=args.fuse_gate_up_exps, fp8_as_q8=args.fp8_as_q8, ) diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index b4c8a7cf00a3..91c006278f14 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -100,6 +100,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, {"name": "tiny_aya", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/tiny-aya-base", }, + {"name": "cohere2moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereLabs/North-Mini-Code-1.0", }, {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", }, {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", }, {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", }, diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 45202b333871..47c09af53fb5 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -25,7 +25,7 @@ from gguf.constants import GGUFValueType # reuse model definitions from the conversion/ package -from conversion import LazyTorchTensor, ModelBase, get_model_class +from conversion import LazyTorchTensor, ModelBase, get_model_class, ModelType, get_model_architecture logger = logging.getLogger("lora-to-gguf") @@ -396,12 +396,12 @@ def load_hparams_from_hf(hf_model_id: str, trust_remote_code: bool) -> tuple[dic hparams = ModelBase.load_hparams(dir_base_model, False) with torch.inference_mode(): + model_arch = get_model_architecture(hparams, ModelType.TEXT) try: - model_arch = hparams.get("text_config", {}).get("architectures", hparams["architectures"])[0] - logger.info("Using model architecture: %s", model_arch) model_class = get_model_class(model_arch) + logger.info("Using model architecture: %s", model_arch) except NotImplementedError: - logger.error(f"Model {hparams['architectures'][0]} is not supported") + logger.error(f"Model {model_arch} is not supported") sys.exit(1) class LoraModel(model_class): # ty: ignore[unsupported-base] diff --git a/docs/android.md b/docs/android.md index 964ce8a1f05a..e8d580a9ed5d 100644 --- a/docs/android.md +++ b/docs/android.md @@ -29,7 +29,7 @@ With Termux, you can install and run `llama.cpp` as if the environment were Linu ``` $ apt update && apt upgrade -y -$ apt install git cmake +$ apt install git cmake libandroid-spawn ``` Then, follow the [build instructions](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md), specifically for CMake. diff --git a/docs/backend/CUDA-FEDORA.md b/docs/backend/CUDA-FEDORA.md index 1508faf776d2..f76a61dfcea3 100644 --- a/docs/backend/CUDA-FEDORA.md +++ b/docs/backend/CUDA-FEDORA.md @@ -270,7 +270,7 @@ You have successfully set up CUDA on Fedora within a toolbox environment using t --- -**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with he toolbox. +**Disclaimer:** Manually installing and modifying system packages can lead to instability of the container. The above steps are provided as a guideline and may need adjustments based on your specific system configuration. Always back up important data before making significant system changes, especially as your home folder is writable and shared with the toolbox. **Acknowledgments:** Special thanks to the Fedora community and NVIDIA documentation for providing resources that assisted in creating this guide. diff --git a/docs/backend/OPENVINO.md b/docs/backend/OPENVINO.md index b0e19abb0901..d5c6f46e299d 100644 --- a/docs/backend/OPENVINO.md +++ b/docs/backend/OPENVINO.md @@ -12,6 +12,25 @@ The OpenVINO backend is implemented in `ggml/src/ggml-openvino` and provides a t - Compiles and caches the model for the target device. - Binds GGML tensor memory to OpenVINO inference tensors and runs inference. +## Contents + +- [Supported Devices](#supported-devices) +- [Supported Model Precisions](#supported-model-precisions) +- [Supported Llama.cpp Tools](#supported-llamacpp-tools) +- [Validated Models](#validated-models) +- [Build Instructions](#build-instructions) + - [0. Prerequisites](#0-prerequisites) + - [1. Install OpenVINO Runtime](#1-install-openvino-runtime) + - [2. Build llama.cpp with OpenVINO Backend](#2-build-llamacpp-with-openvino-backend) + - [Automated Ubuntu Build Script](#automated-ubuntu-build-script) + - [Automated Windows Build Script](#automated-windows-build-script) + - [3. Download Sample Model](#3-download-sample-model) + - [4. Run Inference with OpenVINO Backend](#4-run-inference-with-openvino-backend) + - [5. Docker Build](#5-docker-build) +- [GGML OpenVINO Backend Runtime Configurations](#ggml-openvino-backend-runtime-configurations) +- [Known Limitations](#known-limitations) +- [Work in Progress](#work-in-progress) + ## Supported Devices OpenVINO backend supports the following hardware: @@ -31,55 +50,102 @@ Although OpenVINO supports a wide range of [Intel hardware](https://docs.openvin - `Q4_1` - `Q4_K` - `Q4_K_M` -- `Q5_K` (converted to Q8_0_C at runtime) -- `Q6_K` (converted to Q8_0_C at runtime) +- `Q5_K` (converted to `Q8_0_C` at runtime) +- `Q6_K` (converted to `Q8_0_C` at runtime) > [!NOTE] > Accuracy validation and performance optimizations for quantized models are a work in progress. -## Quantization Support Details - -### CPU and GPU - -- **`Q4_0`, `Q4_1`, `Q4_K_M`, `Q6_K` models are supported** +**CPU and GPU Quantization Details:** - `Q5_K` and `Q6_K` tensors are converted to `Q8_0_C` -### NPU - -- **Primary supported quantization scheme is `Q4_0`** +**NPU Quantization Details:** +- Primary supported quantization scheme is `Q4_0` - `Q6_K` tensors are requantized to `Q4_0_128` in general. For embedding weights, `Q6_K` tensors are requantized to `Q8_0_C` except for the token embedding matrix which is dequantized to fp16 -### Additional Notes - +**Additional Notes:** - Both `Q4_0` and `Q4_1` models use `Q6_K` for the token embedding tensor and the final matmul weight tensor (often the same tensor) - `Q4_0` models may produce some `Q4_1` tensors if an imatrix is provided during quantization using `llama-quantize` - `Q4_K_M` models may include both `Q6_K` and `Q5_K` tensors (observed in Phi-3) +- `Q5_1` tensors are dequantized natively (weights, scales, and zero-points extracted directly) + +## Supported Llama.cpp Tools + +The OpenVINO backend integrates with the standard llama.cpp tools listed below. +However, all the tools coverage across all devices is not uniform and exhaustive validation is work in progress. + +- llama-bench +- llama-cli +- llama-completion +- llama-embedding +- llama-perplexity +- llama-run +- llama-server +- llama-simple ## Validated Models -The following models were validated on Intel® Core™ Ultra Series 2. While our testing was limited, the OpenVINO backend is expected to work across a broad range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html). -- Use `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using GPU device. -- `-fa 1` is required when running llama-bench with the OpenVINO backend. -- Additional model support, quantization formats and validations are work in progress. - -| Model | Validated | Known Issues | -| :------| :---------- | :-------------| -| [Llama-3.2-1B-Instruct](https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — | -| [Meta-Llama-3.1-8B-Instruct](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) | `Q8_0`, `Q4_K_M` on CPU/GPU/NPU | `Q4_0_8_8`, `Q4_0_4_8`, `Q4_0_4_4` fail | -| [Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf) | `FP16`, `Q4` on CPU/NPU | GPU unsupported for `FP16` and `Q4` (`llama-cli`, `llama-bench`) | -| [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — | -| [Qwen3-8B-Instruct](https://huggingface.co/Qwen/Qwen3-8B-GGUF) | `FP16`, `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/NPU; GPU works via `llama-bench` | GPU `llama-cli` unsupported for all quantizations | -| [MiniCPM-V-2_6-GGUF](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | `Q4_0` on CPU/GPU/NPU | — | -| [DeepSeek-R1-Distill-Llama-8B](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF) | `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M` on CPU/GPU/NPU | — | -| [Hunyuan-7B-Instruct](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF) | CPU: `Q8_0`, `Q4_0`, `Q4_1`, `Q4_K_M`; GPU: `Q8_0`, `Q4_0`, `Q4_1`; NPU (`llama-bench` only): `Q4_0`, `Q4_1`, `Q4_K_M` | GPU `Q4_K_M` unsupported; NPU `llama-cli` unsupported | -| [Mistral-7B-Instruct-v0.3](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF/) | CPU/GPU: `Q8_0`, `Q4_K_M`; NPU: `Q8_0`, `Q4_K_M` (via `llama-bench`) | NPU `llama-cli` unsupported for `Q8_0`, `Q4_K_M` | +Although, the validated models below were tested with `llama-cli` using the `Q4_K_M` quantization format on Intel® Core™ Ultra Series 2 (Lunar Lake), the OpenVINO backend is expected to work across a broader range of [Intel hardware](https://docs.openvino.ai/2026/about-openvino/release-notes-openvino/system-requirements.html), [supported model precisions](#supported-model-precisions), [supported llama.cpp tools](#supported-llamacpp-tools) and additional model architectures. + +> [!NOTE] +> Extensive accuracy validation, performance optimizations, and broader architecture coverage are work in progress. + +**Legend & Test Configuration:** +- **Status:** ✓ = Passed | ✗ = Failed or Unsupported +- **Execution Modes:** + - **SL** = Stateless (`GGML_OPENVINO_STATEFUL_EXECUTION=0`) + - **SF** = Stateful (`GGML_OPENVINO_STATEFUL_EXECUTION=1`) + - Note: The NPU operates in stateless mode only. +- **Validation system:** Intel® Core™ Ultra 5 238V (Lunar Lake) | 32 GB RAM | Ubuntu 24.04 | Intel OpenCL GPU Driver 26.18.38308.1 | Intel NPU Driver 1.33.0. +- See [Known Limitations](#known-limitations) for context on observed failures. + +| Model | CPU (SL / SF) | GPU (SL / SF) | NPU (SL) | +| :--- | :---: | :---: | :---: | +| [bartowski/Llama-3.2-1B-Instruct-Q4_K_M](https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ | +| [bartowski/Llama-3.2-3B-Instruct-Q4_K_M](https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ | +| [bartowski/Meta-Llama-3.1-8B-Instruct-Q4_K_M](https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ | +| | | | | +| [Qwen/qwen2.5-1.5b-instruct-q4_k_m](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ | +| [Qwen/qwen2.5-coder-7b-instruct-q4_k_m](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ | +| [bartowski/Qwen_Qwen3-0.6B-Q4_K_M](https://huggingface.co/bartowski/Qwen_Qwen3-0.6B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ | +| [bartowski/Qwen_Qwen3-1.7B-Q4_K_M](https://huggingface.co/bartowski/Qwen_Qwen3-1.7B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ | +| [Qwen/Qwen3-4B-Q4_K_M](https://huggingface.co/Qwen/Qwen3-4B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ | +| [lm-kit/Qwen3-8B-Q4_K_M](https://huggingface.co/lm-kit/qwen-3-8b-instruct-gguf) | ✓ / ✓ | ✓ / ✗ | ✓ | +| | | | | +| [unsloth/gemma-3-4b-it-Q4_K_M](https://huggingface.co/unsloth/gemma-3-4b-it-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ | +| [bartowski/google_gemma-4-E2B-it-Q4_K_M](https://huggingface.co/bartowski/google_gemma-4-E2B-it-GGUF) | ✓ / ✗ | ✓ / ✗ | ✓ | +| [bartowski/google_gemma-4-E4B-it-Q4_K_M](https://huggingface.co/bartowski/google_gemma-4-E4B-it-GGUF) | ✓ / ✗ | ✓ / ✗ | ✓ | +| [bartowski/gemma-4-12B-it-Q4_K_M](https://huggingface.co/bartowski/gemma-4-12B-it-GGUF) | ✓ / ✗ | ✓ / ✗ | ✗ | +| | | | | +| [bartowski/Phi-3-mini-4k-instruct-Q4_K_M](https://huggingface.co/bartowski/Phi-3-mini-4k-instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ | +| [bartowski/Phi-3.5-mini-instruct-Q4_K_M](https://huggingface.co/bartowski/Phi-3.5-mini-instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ | +| | | | | +| [bartowski/Mistral-7B-Instruct-v0.3-Q4_K_M](https://huggingface.co/bartowski/Mistral-7B-Instruct-v0.3-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ | +| [QuantFactory/Ministral-3b-instruct.Q4_K_M](https://huggingface.co/QuantFactory/Ministral-3b-instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ | +| [bartowski/Ministral-8B-Instruct-2410-Q4_K_M](https://huggingface.co/bartowski/Ministral-8B-Instruct-2410-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ | +| | | | | +| [bartowski/DeepSeek-R1-Distill-Llama-8B-Q4_K_M](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ | +| [bartowski/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M](https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ | +| | | | | +| [ibm-granite/granite-4.0-350m-Q4_K_M](https://huggingface.co/ibm-granite/granite-4.0-350m-GGUF) | ✓ / ✓ | ✗ / ✗ | ✓ | +| [ibm-granite/granite-4.0-micro-Q4_K_M](https://huggingface.co/ibm-granite/granite-4.0-micro-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ | +| [ibm-granite/granite-4.0-1b-Q4_K_M](https://huggingface.co/ibm-granite/granite-4.0-1b-GGUF) | ✓ / ✓ | ✗ / ✗ | ✗ | +| [ibm-research/granite-3.2-8b-instruct-Q4_K_M](https://huggingface.co/ibm-research/granite-3.2-8b-instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ | +| | | | | +| [HuggingFaceTB/smollm2-1.7b-instruct-q4_k_m](https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✓ | ✓ | +| [openbmb/MiniCPM-V-2_6-Q4_K_M](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) | ✓ / ✓ | ✓ / ✗ | ✓ | +| [bartowski/tencent_Hunyuan-7B-Instruct-Q4_K_M](https://huggingface.co/bartowski/tencent_Hunyuan-7B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ | +| [LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-Q4_K_M](https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ | +| [bartowski/prism-ml_Bonsai-8B-unpacked-Q4_K_M](https://huggingface.co/bartowski/prism-ml_Bonsai-8B-unpacked-GGUF) | ✓ / ✓ | ✓ / ✗ | ✓ | +| | | | | +| [gpustack/bge-m3-Q4_K_M.gguf](https://huggingface.co/gpustack/bge-m3-GGUF) | ✓ | ✗ | ✗ | ## Build Instructions -### Prerequisites +### 0. Prerequisites - Linux or Windows system with Intel hardware (CPU, GPU, or NPU) -- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2025/get-started/install-openvino/configurations.html). +- **For Intel GPU or NPU Usage**: Install the appropriate hardware drivers for your Intel GPU or NPU. For detailed instructions, see: [Additional Configurations for Hardware Acceleration](https://docs.openvino.ai/2026/get-started/install-openvino/configurations.html). - **Linux:** - Git, CMake, and Ninja software tools are needed for building. @@ -119,68 +185,390 @@ The following models were validated on Intel® Core™ Ultra Series 2. While our - Follow the guide to install OpenVINO Runtime from an archive file: [Linux](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-linux.html) | [Windows](https://docs.openvino.ai/2026/get-started/install-openvino/install-openvino-archive-windows.html) +- Verify OpenVINO is initialized properly: + ```bash + echo $OpenVINO_DIR + ``` + +### 2. Build llama.cpp with OpenVINO Backend + +Clone llama.cpp repo and build : + +```bash +git clone https://github.com/ggml-org/llama.cpp +cd llama.cpp +``` + - **Linux:** +```bash +source /opt/intel/openvino/setupvars.sh +cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON +cmake --build build/ReleaseOV --parallel +``` -
- 📦 Click to expand OpenVINO installation from an archive file on Ubuntu -
+- **Windows:** Open a **Developer Command Prompt for VS 2022** (so the MSVC toolchain is on `PATH`), then run: - ```bash - wget https://raw.githubusercontent.com/ravi9/misc-scripts/main/openvino/ov-archive-install/install-openvino-from-archive.sh - chmod +x install-openvino-from-archive.sh - ./install-openvino-from-archive.sh - ``` +```cmd +C:\Intel\openvino\setupvars.bat +cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake +cmake --build build\ReleaseOV --parallel +``` - Verify OpenVINO is initialized properly: - ```bash - echo $OpenVINO_DIR - ``` -
+> [!NOTE] +> The Windows install path is `C:\Intel\openvino` (no spaces) to avoid quoting problems some CMake/Ninja toolchains have with `C:\Program Files (x86)\...`. Adjust to wherever you installed OpenVINO Runtime. From `cmd`, run `C:\Intel\openvino\setupvars.bat`; from PowerShell, run `& "C:\Intel\openvino\setupvars.ps1"` instead. Once the build is finished you can launch the binaries from any `cmd` or `PowerShell` window after sourcing the matching `setupvars` script for that shell. +#### Automated Ubuntu Build Script -### 2. Build llama.cpp with OpenVINO Backend +For Ubuntu24 users, the following shell script automates the prerequisite installs (build tools, OpenCL ICD), the OpenVINO Runtime download/extract/setup, and the Ninja-based llama.cpp build. +Save the following as `ubuntu-llamacpp-ov-install.sh` next to where you want the `llama.cpp` folder to land, then run it: -Clone the OpenVINO-enabled llama.cpp fork and build it: +```bash +chmod +x ubuntu-llamacpp-ov-install.sh +./ubuntu-llamacpp-ov-install.sh +``` + +
+Click to expand ubuntu-llamacpp-ov-install.sh ```bash -git clone https://github.com/ggml-org/llama.cpp -cd llama.cpp +#!/usr/bin/env bash +# ============================================ +# llama.cpp OpenVINO Build Script (Ninja) +# ============================================ +set -euo pipefail + +OPENVINO_VERSION_MAJOR="2026.2.1" +OPENVINO_VERSION_FULL="2026.2.1.21919.ede283a88e3" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OPENVINO_INSTALL_DIR="/opt/intel/openvino_${OPENVINO_VERSION_MAJOR}" +OPENVINO_LINK_DIR="/opt/intel/openvino" +OPENVINO_TGZ="${SCRIPT_DIR}/openvino.tgz" +OPENVINO_URL="https://storage.openvinotoolkit.org/repositories/openvino/packages/${OPENVINO_VERSION_MAJOR}/linux/openvino_toolkit_ubuntu24_${OPENVINO_VERSION_FULL}_x86_64.tgz" + +echo "============================================" +echo "Installing prerequisites (apt)..." +echo "============================================" +sudo apt-get update +sudo apt-get install -y \ + build-essential libcurl4-openssl-dev libtbb12 \ + cmake ninja-build python3-pip \ + curl wget tar git + +echo "============================================" +echo "Installing OpenCL runtime + headers..." +echo "============================================" +sudo apt-get install -y \ + ocl-icd-opencl-dev opencl-headers opencl-clhpp-headers intel-opencl-icd + +cd "${SCRIPT_DIR}" + +# ============================================ +# Clone llama.cpp if missing +# ============================================ +if [[ ! -f "llama.cpp/CMakeLists.txt" ]]; then + echo "Cloning llama.cpp..." + git clone https://github.com/ggml-org/llama.cpp +fi + +# ============================================ +# Setup OpenVINO: download & extract to /opt/intel/openvino_${OPENVINO_VERSION_MAJOR}, +# then point /opt/intel/openvino at it via symlink so the active version is swappable. +# ============================================ +if [[ -f "${OPENVINO_INSTALL_DIR}/setupvars.sh" ]]; then + echo "OpenVINO ${OPENVINO_VERSION_MAJOR} already installed at ${OPENVINO_INSTALL_DIR}. Skipping download." +else + echo "OpenVINO not found at ${OPENVINO_INSTALL_DIR}. Starting download..." + curl -L -o "${OPENVINO_TGZ}" "${OPENVINO_URL}" + + echo "Extracting OpenVINO to ${OPENVINO_INSTALL_DIR}..." + sudo mkdir -p "${OPENVINO_INSTALL_DIR}" + sudo tar -xzf "${OPENVINO_TGZ}" -C "${OPENVINO_INSTALL_DIR}" --strip-components=1 + rm -f "${OPENVINO_TGZ}" +fi + +# Refresh symlink: /opt/intel/openvino -> /opt/intel/openvino_${OPENVINO_VERSION_MAJOR} +sudo ln -sfn "${OPENVINO_INSTALL_DIR}" "${OPENVINO_LINK_DIR}" + +OPENVINO_ROOT="${OPENVINO_LINK_DIR}" +echo "OpenVINO Ready: ${OPENVINO_ROOT} -> ${OPENVINO_INSTALL_DIR}" + +# Install OpenVINO's own runtime dependencies (one-time per system). +if [[ -x "${OPENVINO_ROOT}/install_dependencies/install_openvino_dependencies.sh" ]]; then + echo "============================================" + echo "Installing OpenVINO runtime dependencies..." + echo "============================================" + echo "Y" | sudo -E "${OPENVINO_ROOT}/install_dependencies/install_openvino_dependencies.sh" +fi + +# ============================================ +# Clean old build cache +# ============================================ +cd "${SCRIPT_DIR}/llama.cpp" +if [[ -d "build/ReleaseOV" ]]; then + echo "Removing old build directory..." + rm -rf "build/ReleaseOV" +fi + +echo "============================================" +echo "Configuring with CMake..." +echo "============================================" +# shellcheck disable=SC1091 +source "${OPENVINO_ROOT}/setupvars.sh" + +cmake -B build/ReleaseOV -G Ninja \ + -DCMAKE_BUILD_TYPE=Release \ + -DGGML_OPENVINO=ON + +cmake --build build/ReleaseOV --parallel + +echo "============================================" +echo "Build completed successfully!" +echo "============================================" +echo "Binaries: $(pwd)/build/ReleaseOV/bin" +echo +echo "NOTE: To run, source setupvars.sh and pick a device:" +echo " source /opt/intel/openvino/setupvars.sh" +echo " export GGML_OPENVINO_DEVICE=CPU # or GPU / NPU" +echo " ./build/ReleaseOV/bin/llama-cli -m model.gguf" ``` -- **Linux:** - ```bash - source /opt/intel/openvino/setupvars.sh - cmake -B build/ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON - cmake --build build/ReleaseOV --parallel - ``` +> [!NOTE] +> The script pins OpenVINO `2026.2.1` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. + +
+ +#### Automated Windows Build Script + +For Windows users, the following `.bat` script automates the prerequisite installs (Git, Ninja, CMake, Visual Studio 2022 Build Tools, vcpkg + OpenCL), the OpenVINO Runtime download/extract, and the Ninja-based llama.cpp build. +Save the following as `windows-llamacpp-ov-install.bat` next to where you want the `llama.cpp` to land, then run it from either **Command Prompt** or **PowerShell**: + +```cmd +:: Command Prompt +windows-llamacpp-ov-install.bat +``` + +```powershell +# PowerShell +.\windows-llamacpp-ov-install.bat +``` + +
+Click to expand windows-llamacpp-ov-install.bat + +```bat +@echo off +setlocal enabledelayedexpansion + +REM ============================================ +REM llama.cpp OpenVINO Build Script (Ninja) +REM ============================================ + +set "OPENVINO_VERSION_MAJOR=2026.2.1" +set "OPENVINO_VERSION_FULL=2026.2.1.21919.ede283a88e3" + +set "SCRIPT_DIR=%~dp0" +set "VCPKG_DIR=C:\vcpkg" +set "OPENVINO_INSTALL_DIR=C:\Intel\openvino_%OPENVINO_VERSION_MAJOR%" +set "OPENVINO_LINK_DIR=C:\Intel\openvino" +set "OPENVINO_ZIP=%SCRIPT_DIR%openvino.zip" +set "OPENVINO_EXTRACT_TMP=%SCRIPT_DIR%openvino_extract_tmp" +set "OPENVINO_URL=https://storage.openvinotoolkit.org/repositories/openvino/packages/%OPENVINO_VERSION_MAJOR%/windows/openvino_toolkit_windows_%OPENVINO_VERSION_FULL%_x86_64.zip" + +echo ============================================ +echo Installing prerequisites... +echo ============================================ +winget install --id Git.Git -e --accept-source-agreements --accept-package-agreements 2>nul +winget install --id Ninja-build.Ninja -e --accept-source-agreements --accept-package-agreements 2>nul +winget install --id Kitware.CMake -e --accept-source-agreements --accept-package-agreements 2>nul + +REM Ensure Visual Studio Build Tools are installed. +echo Checking for Visual Studio Build Tools... +set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" +set "VS_INSTALLED=" +if exist "%VSWHERE%" ( + for /f "usebackq tokens=*" %%i in (`"%VSWHERE%" -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath 2^>nul`) do ( + set "VS_INSTALLED=%%i" + ) +) +if defined VS_INSTALLED ( + echo Visual Studio with VC++ x86/x64 tools already present at "!VS_INSTALLED!". Skipping winget install. +) else ( + winget install --id Microsoft.VisualStudio.2022.BuildTools -e --override "--wait --passive --add Microsoft.VisualStudio.Workload.VCTools --includeRecommended" --accept-source-agreements --accept-package-agreements + if errorlevel 1 ( + echo WARNING: winget could not install Visual Studio Build Tools automatically. + echo Install manually from https://aka.ms/vs/17/release/vs_BuildTools.exe ^(select the "Desktop development with C++" workload^) + echo and re-run this script from a "Developer Command Prompt for VS 2022". + ) +) + +echo ============================================ +echo Installing OpenCL via vcpkg... +echo ============================================ +if not exist "%VCPKG_DIR%" ( + git clone https://github.com/microsoft/vcpkg "%VCPKG_DIR%" + cd /d "%VCPKG_DIR%" + call bootstrap-vcpkg.bat + call vcpkg integrate install +) +cd /d "%VCPKG_DIR%" +call vcpkg install opencl + +cd /d "%SCRIPT_DIR%" + +REM ============================================ +REM Clone llama.cpp if missing +REM ============================================ +if not exist "llama.cpp\CMakeLists.txt" ( + echo Cloning llama.cpp... + git clone https://github.com/ggml-org/llama.cpp +) + +cd /d "llama.cpp" +set "SCRIPT_DIR=%CD%" + +REM ============================================ +REM Setup OpenVINO: download & extract to C:\Intel\openvino_%OPENVINO_VERSION_MAJOR%, +REM then point C:\Intel\openvino at it via a directory junction (mklink /J). +REM ============================================ + +if exist "%OPENVINO_INSTALL_DIR%\setupvars.bat" ( + echo OpenVINO %OPENVINO_VERSION_MAJOR% already installed at "%OPENVINO_INSTALL_DIR%". Skipping download. +) else ( + echo OpenVINO not found at "%OPENVINO_INSTALL_DIR%". Starting download... + + curl -L -o "%OPENVINO_ZIP%" "%OPENVINO_URL%" + if errorlevel 1 ( + echo ERROR: Download failed. + exit /b 1 + ) + + echo Extracting OpenVINO... + if exist "%OPENVINO_EXTRACT_TMP%" rmdir /s /q "%OPENVINO_EXTRACT_TMP%" + mkdir "%OPENVINO_EXTRACT_TMP%" + tar -xf "%OPENVINO_ZIP%" -C "%OPENVINO_EXTRACT_TMP%" + if errorlevel 1 ( + echo ERROR: Extraction failed. + exit /b 1 + ) + + REM Move the single top-level folder contents into the versioned install dir. + REM NOTE: delayed expansion (!VAR!) is required because the surrounding else( ... ) + REM block is parsed once up-front, so %OPENVINO_EXTRACTED% would expand to "" here + REM and xcopy would then treat "\*" as C:\* and fail with "Cannot perform a cyclic copy". + set "OPENVINO_EXTRACTED=" + for /d %%i in ("%OPENVINO_EXTRACT_TMP%\*") do set "OPENVINO_EXTRACTED=%%i" + if not defined OPENVINO_EXTRACTED ( + echo ERROR: Could not locate extracted OpenVINO folder under "%OPENVINO_EXTRACT_TMP%". + exit /b 1 + ) + if not exist "%OPENVINO_INSTALL_DIR%" mkdir "%OPENVINO_INSTALL_DIR%" + xcopy /e /i /y /q "!OPENVINO_EXTRACTED!\*" "%OPENVINO_INSTALL_DIR%\" >nul + if errorlevel 1 ( + echo ERROR: Failed to copy OpenVINO from "!OPENVINO_EXTRACTED!" to "%OPENVINO_INSTALL_DIR%". + echo Re-run this script from an elevated Command Prompt ^(Run as administrator^) if access is denied. + exit /b 1 + ) + + rmdir /s /q "%OPENVINO_EXTRACT_TMP%" + del "%OPENVINO_ZIP%" +) + +REM Refresh junction: C:\Intel\openvino -> C:\Intel\openvino_. +REM `mklink /J` creates a directory junction (no admin / Developer Mode required). +if exist "%OPENVINO_LINK_DIR%" rmdir "%OPENVINO_LINK_DIR%" +mklink /J "%OPENVINO_LINK_DIR%" "%OPENVINO_INSTALL_DIR%" >nul +if errorlevel 1 ( + echo ERROR: Failed to create junction "%OPENVINO_LINK_DIR%" -^> "%OPENVINO_INSTALL_DIR%". + echo If "%OPENVINO_LINK_DIR%" already exists as a regular non-empty folder, remove it manually and re-run. + exit /b 1 +) + +set "OPENVINO_ROOT=%OPENVINO_LINK_DIR%" +echo OpenVINO Ready: %OPENVINO_ROOT% -^> %OPENVINO_INSTALL_DIR% + + +echo ============================================ +echo Setting up compiler environment... +echo ============================================ +REM Locate Visual Studio Build Tools vcvars64.bat +set "VSWHERE=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" +if exist "%VSWHERE%" ( + for /f "usebackq tokens=*" %%i in (`"%VSWHERE%" -latest -products Microsoft.VisualStudio.Product.BuildTools -property installationPath`) do ( + set "VS_PATH=%%i" + ) +) +if defined VS_PATH ( + call "%VS_PATH%\VC\Auxiliary\Build\vcvars64.bat" >nul +) else ( + echo WARNING: Visual Studio Build Tools not found. Compiler may be missing. +) + +REM ============================================ +REM Clean old build cache +REM ============================================ +if exist "build\ReleaseOV" ( + echo Removing old build directory ... + rmdir /s /q "build\ReleaseOV" +) + +echo ============================================ +echo Configuring with CMake... +echo ============================================ +call "%OPENVINO_ROOT%\setupvars.bat" >nul 2>nul + +cmake -B build\ReleaseOV -G Ninja ^ + -DCMAKE_BUILD_TYPE=Release ^ + -DGGML_OPENVINO=ON ^ + -DCMAKE_TOOLCHAIN_FILE="%VCPKG_DIR%\scripts\buildsystems\vcpkg.cmake" + +if errorlevel 1 ( + echo If you continue to face CMAKE errors, make sure to install: + echo winget install Microsoft.VisualStudio.2022.BuildTools + echo Then run the "Developer Command Prompt for VS 2022" and launch this script from there. + exit /b 1 +) + +cmake --build build\ReleaseOV --config Release +if errorlevel 1 exit /b 1 + +echo ============================================ +echo Build completed successfully! +echo ============================================ +echo Binaries: %CD%\build\ReleaseOV\bin +echo. +echo NOTE: To run, source setupvars.bat and pick a device: +echo call "C:\Intel\openvino\setupvars.bat" +echo set GGML_OPENVINO_DEVICE=CPU ^&^& REM or GPU / NPU +echo build\ReleaseOV\bin\llama-cli.exe -m model.gguf +echo. + +endlocal +``` -- **Windows:** - ```cmd - # x64 Native Tools Command Prompt for VS 2022 - "C:\Program Files (x86)\Intel\openvino_2026.0\setupvars.bat" - cmake -B build\ReleaseOV -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_OPENVINO=ON -DLLAMA_CURL=OFF -DCMAKE_TOOLCHAIN_FILE=C:\vcpkg\scripts\buildsystems\vcpkg.cmake - cmake --build build\ReleaseOV --parallel - ``` > [!NOTE] -> Use `x64 Native Tools Command Prompt` for Windows build. After building, you could use either `cmd` or `PowerShell` to run the OpenVINO backend. +> The script pins OpenVINO `2026.2.1` via the `OPENVINO_VERSION_MAJOR` / `OPENVINO_VERSION_FULL` variables at the top — edit them to track a different release. From any new shell, source the matching `setupvars` script via the junction — `call "C:\Intel\openvino\setupvars.bat"` from `cmd`, or `& "C:\Intel\openvino\setupvars.ps1"` from PowerShell. If `winget` cannot register Visual Studio Build Tools on first run, install them once manually and re-run the script from an elevated **Developer Command Prompt for VS 2022**. + +
+ ### 3. Download Sample Model -Download models for testing: +Download sample model for testing. ```bash # Linux mkdir -p ~/models/ -wget https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf \ - -O ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf +wget https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf \ + -O ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf # Windows PowerShell mkdir C:\models -Invoke-WebRequest -Uri https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -OutFile C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf +Invoke-WebRequest -Uri https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf -OutFile C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf # Windows Command Line mkdir C:\models -curl -L https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf -o C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf +curl -L https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_K_M.gguf -o C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf ``` ### 4. Run Inference with OpenVINO Backend @@ -196,65 +584,45 @@ When using the OpenVINO backend, the first inference token may have slightly hig # Linux export GGML_OPENVINO_DEVICE=GPU -# Enable stateful execution with GPU device to avoid known stateless execution failures. +# Optional: enable stateful execution for improved GPU performance (recommended). export GGML_OPENVINO_STATEFUL_EXECUTION=1 # To run llama-simple: -./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is " +./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -n 50 "The story of AI is " # To run in chat mode: -./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024 +./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -c 1024 # To run llama-bench, -fa 1 is needed -GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-bench -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -fa 1 +GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./build/ReleaseOV/bin/llama-bench -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -fa 1 # NPU: keep context small to avoid failures from very large model context windows. export GGML_OPENVINO_DEVICE=NPU -./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 512 +./build/ReleaseOV/bin/llama-cli -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -c 512 # Windows Command Line set GGML_OPENVINO_DEVICE=GPU -# Enable stateful execution with GPU device to avoid known stateless execution failures. +# Optional: enable stateful execution for improved GPU performance (recommended). set GGML_OPENVINO_STATEFUL_EXECUTION=1 # Windows PowerShell $env:GGML_OPENVINO_DEVICE = "GPU" $env:GGML_OPENVINO_STATEFUL_EXECUTION = "1" # To run llama-simple -build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is " +build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -n 50 "The story of AI is " # To run in chat mode: -build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 1024 +build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -c 1024 # To run llama-bench, -fa 1 is needed -build\ReleaseOV\bin\llama-bench.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -fa 1 +build\ReleaseOV\bin\llama-bench.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -fa 1 # NPU: keep context small to avoid failures from very large model context windows. # Windows Command Line set GGML_OPENVINO_DEVICE=NPU # Windows PowerShell $env:GGML_OPENVINO_DEVICE = "NPU" -build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -c 512 +build\ReleaseOV\bin\llama-cli.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -c 512 ``` > [!NOTE] > On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html) for more details. -### Known Issues and Current Workarounds - -- GPU stateless execution is currently affected by a known issue. - - Workaround: set `GGML_OPENVINO_STATEFUL_EXECUTION=1` when using GPU device. -- NPU failures can happen when context size is too large. Recent llama.cpp behavior may resolve context size to the model training context (for example, 131072 for Llama 3.2 1B), which is too large for current NPU usage and can also stress laptop CPU/GPU on larger models. To inspect the selected context size, run `llama-cli` or `llama-server` with `-lv 3`. - - Workaround: explicitly set context size, for ex. `-c 1024` for NPU runs. Performance will be better with lower context size. -- Additional NPU limitations: - - Model caching is not yet supported. - - `llama-server -np > 1` (multiple parallel sequences) is not supported. - - `llama-perplexity` is only supported with `-b 512` or smaller. -- `--context-shift` with `llama-cli` is currently not supported with OpenVINO backend across CPU, GPU, and NPU devices. -- Encoder models (embedding, reranking) are not supported with the current OpenVINO backend implementation. -- `-fa 1` is required when running llama-bench with the OpenVINO backend. - - `GGML_OPENVINO_STATEFUL_EXECUTION=1 GGML_OPENVINO_DEVICE=GPU ./llama-bench -fa 1` -- `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled. - -> [!NOTE] -> The OpenVINO backend is actively under development. Fixes are underway, and this document will continue to be updated as issues are resolved. - - -### Docker Build +### 5. Docker Build You can build and run llama.cpp with OpenVINO backend using Docker. @@ -272,7 +640,7 @@ docker build --target=light -t llama-openvino:light -f .devops/openvino.Dockerfi docker build --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile . # If you are behind a proxy: -docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --target=light -t llama-openvino:light -f .devops/openvino.Dockerfile . +docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --target=server -t llama-openvino:server -f .devops/openvino.Dockerfile . ``` Run llama.cpp with OpenVINO backend Docker container. @@ -281,19 +649,19 @@ Save sample models in `~/models` as [shown above](#3-download-sample-model). It ```bash # Run Docker container -docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf +docker run --rm -it -v ~/models:/models llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf # With Intel GPU access (iGPU or dGPU) docker run --rm -it -v ~/models:/models \ --device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \ --env=GGML_OPENVINO_DEVICE=GPU --env=GGML_OPENVINO_STATEFUL_EXECUTION=1 \ -llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf +llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf # With Intel NPU access docker run --rm -it -v ~/models:/models \ --device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \ --env=GGML_OPENVINO_DEVICE=NPU \ -llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf +llama-openvino:light --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf ``` Run Llama.cpp Server with OpenVINO Backend. @@ -301,17 +669,30 @@ Run Llama.cpp Server with OpenVINO Backend. > `llama-server` with OpenVINO backend supports only one chat session/thread, when `GGML_OPENVINO_STATEFUL_EXECUTION=1` is enabled. ```bash -# Run the Server Docker container -docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_0.gguf -c 1024 -# Or Using llama-server executable -./build/ReleaseOV/bin/llama-server -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf --port 8080 -c 1024 +# Run the llama-openvino:server Docker container (CPU) +docker run --rm -it -p 8080:8080 -v ~/models:/models llama-openvino:server --no-warmup -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -c 1024 --host 0.0.0.0 -# If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost -export NO_PROXY=localhost,127.0.0.1 +# Run the llama-openvino:server Docker container with Intel GPU access (iGPU or dGPU) +docker run --rm -it -v ~/models:/models \ +--device=/dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \ +-p 8080:8080 --env=GGML_OPENVINO_DEVICE=GPU \ +llama-openvino:server --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf --host 0.0.0.0 + +# Run the llama-openvino:server Docker container with Intel NPU access +docker run --rm -it -v ~/models:/models \ +--device=/dev/accel --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -u $(id -u):$(id -g) \ +-p 8080:8080 --env=GGML_OPENVINO_DEVICE=NPU \ +llama-openvino:server --no-warmup -c 1024 -m /models/Llama-3.2-1B-Instruct-Q4_K_M.gguf --host 0.0.0.0 + +# Or Using llama-server executable +./build/ReleaseOV/bin/llama-server -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf --port 8080 -c 1024 # Option 1: Open your browser to http://localhost:8080 to access the web UI for the llama.cpp server. # Option 2: In a NEW terminal, test the server with curl +# If you are behind a proxy, make sure to set NO_PROXY to avoid proxy for localhost +export NO_PROXY=localhost,127.0.0.1 + # Test health endpoint curl -f http://localhost:8080/health @@ -320,24 +701,26 @@ curl -X POST "http://localhost:8080/v1/chat/completions" -H "Content-Type: appli -d '{"messages":[{"role":"user","content":"Write a poem about OpenVINO"}],"max_tokens":100}' | jq . ``` -## Runtime Configuration +## GGML OpenVINO Backend Runtime Configurations The OpenVINO backend can be configured using the following environment variables at runtime to control device selection, caching, debugging, and profiling behavior. - -### Configuration Options - -| Variable | Default | Description | -|-----------------------------------|------------|-------------------------------------------------------------------------------------------------------------| -| `GGML_OPENVINO_DEVICE` | `CPU` | Specify the target device (CPU, GPU, NPU). On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html). When set to **NPU**, static compilation mode is enabled for optimal performance. | -| `GGML_OPENVINO_CACHE_DIR` | `not set` | Directory for OpenVINO model caching (recommended: `/tmp/ov_cache`). Enables model caching when set. **Not supported on NPU devices.** | -| `GGML_OPENVINO_PREFILL_CHUNK_SIZE`| `256` | Token chunk size for **NPU** prefill. | -| `GGML_OPENVINO_STATEFUL_EXECUTION`| `0` | Enable stateful KV cache on for better performance. Recommended on CPU, GPU. | -| `GGML_OPENVINO_PROFILING` | `0` | Enable execution-time profiling. | -| `GGML_OPENVINO_DUMP_CGRAPH` | `0` | Dump the GGML compute graph to `cgraph_ov.txt`. | -| `GGML_OPENVINO_DUMP_IR` | `0` | Serialize OpenVINO IR files with timestamps. | -| `GGML_OPENVINO_DEBUG_INPUT` | `0` | Enable input debugging and print input tensor info. | -| `GGML_OPENVINO_DEBUG_OUTPUT` | `0` | Enable output debugging and print output tensor info. | -| `GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS` | `0` | Print tensor address map once. | +Boolean flags follow a uniform convention: set to a **positive integer** (e.g. `1`) to enable; unset, empty, `0`, negative, or non-numeric values are treated as disabled. + +| Variable | Type | Default | Description | +|-----------------------------------|-----------|------------|-------------------------------------------------------------------------------------------------------------| +| `GGML_OPENVINO_DEVICE` | String | `CPU` | Specify the target device (CPU, GPU, NPU). On systems with multiple GPUs, use `GPU.0` or `GPU.1` to explicitly target specific GPU. See [OpenVINO GPU Device](https://docs.openvino.ai/2026/openvino-workflow/running-inference/inference-devices-and-modes/gpu-device.html). When set to **NPU**, static compilation mode is enabled for optimal performance. | +| `GGML_OPENVINO_CACHE_DIR` | String | `not set` | Directory for OpenVINO model caching (recommended: `/tmp/ov_cache`). Enables model caching when set. **Not supported on NPU devices.** | +| `GGML_OPENVINO_PREFILL_CHUNK_SIZE`| Integer | `256` | Token chunk size for **NPU** prefill (NPU-only; ignored on CPU/GPU). Must be a positive integer; otherwise the default is used. | +| `GGML_OPENVINO_STATEFUL_EXECUTION`| Boolean | `0` | Enable stateful KV cache for better performance. Recommended on CPU, GPU. | +| `GGML_OPENVINO_DISABLE_CACHE` | Boolean | `0` | Disable the in-process compiled-model / decoder cache (cache is on by default). Set to `1` to disable. | +| `GGML_OPENVINO_DISABLE_KV_SLICE` | Boolean | `0` | Disable the KV-cache input-tensor slicing optimization (slicing is on by default on CPU/GPU). Set to `1` to disable. | +| `GGML_OPENVINO_MANUAL_GQA_ATTN` | Boolean | device-based | Tri-state. When **unset**, manual GQA attention is enabled by default on `GPU` and disabled on other devices. Set to a positive integer to force-enable, or `0` to force-disable. | +| `GGML_OPENVINO_PROFILING` | Boolean | `0` | Enable execution-time profiling. | +| `GGML_OPENVINO_DUMP_CGRAPH` | Boolean | `0` | Dump the GGML compute graph to `cgraph_ov.txt`. | +| `GGML_OPENVINO_DUMP_IR` | Boolean | `0` | Serialize OpenVINO IR files with timestamps. | +| `GGML_OPENVINO_DEBUG_INPUT` | Boolean | `0` | Enable input debugging and print input tensor info. | +| `GGML_OPENVINO_DEBUG_OUTPUT` | Boolean | `0` | Enable output debugging and print output tensor info. | +| `GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS` | Boolean | `0` | Print tensor address map once. | > [!NOTE] >`GGML_OPENVINO_STATEFUL_EXECUTION` is an **Experimental** feature to allow stateful execution for managing the KV cache internally inside the OpenVINO model, improving performance on CPUs and GPUs. Stateful execution is not effective on NPUs, and not all models currently support this feature. This feature is experimental and has been validated only with the llama-simple, llama-cli, llama-bench, and llama-run applications and is recommended to enable for the best performance. Other applications, such as llama-server and llama-perplexity, are not yet supported. @@ -355,7 +738,7 @@ export GGML_OPENVINO_PROFILING=1 export GGML_OPENVINO_DEVICE=GPU export GGML_OPENVINO_STATEFUL_EXECUTION=1 -./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_0.gguf -n 50 "The story of AI is " +./build/ReleaseOV/bin/llama-simple -m ~/models/Llama-3.2-1B-Instruct-Q4_K_M.gguf -n 50 "The story of AI is " # Windows Command Line set GGML_OPENVINO_CACHE_DIR=C:\tmp\ov_cache @@ -369,19 +752,39 @@ $env:GGML_OPENVINO_PROFILING = "1" $env:GGML_OPENVINO_DEVICE = "GPU" $env:GGML_OPENVINO_STATEFUL_EXECUTION = "1" -build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_0.gguf" -n 50 "The story of AI is " +build\ReleaseOV\bin\llama-simple.exe -m "C:\models\Llama-3.2-1B-Instruct-Q4_K_M.gguf" -n 50 "The story of AI is " ``` -## Llama.cpp Tools +## Known Limitations -The following tools work with the OpenVINO backend on CPU, GPU, NPU: -- llama-bench -- llama-cli -- llama-completion -- llama-perplexity -- llama-server -- llama-simple +**General (all devices)** + +- Llama.cpp OpenVINO backend currently supports a subset of GGML ops and text-only models. Unsupported ops or unsupported op shapes/cases fail during OpenVINO translation. +- Multimodal features (audio/image/video) are a work in progress. +- Limited Embedding and Reranking model support. +- Llama.cpp tool coverage across CPU/GPU/NPU is not uniform. + +**Tool-specific** + +- `llama-bench`: requires `-fa 1` (flash-attention). +- `llama-cli --context-shift`: stateless only (`GGML_OPENVINO_STATEFUL_EXECUTION=0`). In stateful mode the KV cache is owned by the OpenVINO model and cannot be shifted externally. +- `llama-server`: only one chat session/thread when `GGML_OPENVINO_STATEFUL_EXECUTION=1`. + +**GPU-specific** + +- `llama-server -np > 1`: concurrent requests are batched together, which may slightly reduce per-request throughput. + +**NPU-specific** + +- Default context resolves to the model's training context (e.g. 131072 for Llama 3.2 1B), which can OOM or fail or degrade performance on NPU. Inspect the resolved value with `-lv 3`. + - **Workaround:** Pass an explicit `-c `, e.g. `-c 1024`. +- NPU device uses a static graph with a fixed prefill chunk size (defaults to 256), configurable with `GGML_OPENVINO_PREFILL_CHUNK_SIZE`. Large prefill/batch settings may need tuning. +- `llama-server -np > 1` (multiple parallel sequences) is not supported. +- `llama-perplexity`: requires `-b 512` or smaller. + +> [!NOTE] +> The OpenVINO backend is actively under development. Fixes and improvements are underway, and this document will continue to be updated. ## Work in Progress diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 3ea94d9d788a..8b0b9a18691a 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -161,6 +161,64 @@ You could update your test result in it directly. Please refer to [Docker with SYCL](../docker.md#docker-with-sycl) for details. +## Quick Development WOW + +This chapter is for quick development & try with SYCL backend on Intel GPU. + +You need to install following sofeware before development: + - Intel GPU driver + - oneAPI package + - other development tools. + +Please refer to [Linux](#linux) or [Windows](#windows-1) for above installation and resolve the trouble in usage. There are the detailed guide. + +- Linux + +``` +## build from source code +./examples/sycl/build.sh + +## run CONV_2D_DW unit test cases +./build/bin/test-backend-ops -b SYCL0 -o CONV_2D_DW + +## run all unit test cases +./build/bin/test-backend-ops -b SYCL0 + +## run with LLM on the first GPU +./examples/sycl/test.sh -mg 0 -m xxxx.gguf + +## run service with LLM on the first GPU +export ONEAPI_DEVICE_SELECTOR="level_zero:0" +./examples/sycl/start-svr.sh -m xxxx.gguf + +## update the docs/ops.md for new/update OPs +./examples/sycl/update-ops-doc.sh +``` + +- Windows + +``` +## build from source code +examples\sycl\win-build-sycl.bat + +## run CONV_2D_DW unit test cases +build\bin\test-backend-ops.exe -b SYCL0 -o CONV_2D_DW + +## run all unit test cases +build\bin\test-backend-ops.exe -b SYCL0 + +## run LLM on the first GPU +examples\sycl\win-test.bat -mg 0 -m xxxx.gguf + +## run service with LLM on the first GPU +set ONEAPI_DEVICE_SELECTOR="level_zero:0" +examples\sycl\win-start-svr.bat -m xxxx.gguf + +## update the docs/ops.md for new/update OPs +examples\sycl\win-update-ops-doc.bat +``` + + ## Linux ### I. Setup Environment @@ -253,6 +311,7 @@ When targeting an intel GPU, the user should expect one or more devices among th #### Intel GPU ```sh +# Uses FP32, consider using FP16 for better performance in most cases ./examples/sycl/build.sh ``` @@ -262,12 +321,12 @@ or # Export relevant ENV variables source /opt/intel/oneapi/setvars.sh -# Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx - -# Option 2: Use FP16 +# Option 1: Use FP16 (recommended for better performance in most cases) cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON +# Option 2: Use FP32 +cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx + # build all binary cmake --build build --config Release -j -v ``` @@ -354,6 +413,15 @@ In two device selection modes, the default SYCL backend is level_zero, you can c |------------------|----------------------------------------| | Single device | --split-mode none --main-gpu DEVICE_ID | | Multiple devices | --split-mode layer (default) | +| Multiple devices | --split-mode tensor (tensor parallelism) | + +`--split-mode tensor` (tensor parallelism) shards each layer across the selected +GPUs. It requires flash attention, which is auto-enabled when `--flash-attn` is +left at its default `auto`, so `--split-mode tensor` works out of the box. +Passing `--flash-attn off` together with `--split-mode tensor` is rejected at +context creation. The default `f16` KV cache is recommended. Tensor parallelism +is currently optimized for 2 GPUs; other device counts fall back to a generic +all-reduce. Examples: @@ -469,6 +537,7 @@ Choose one of following methods to build from source code. ##### Option 1: Script ```sh +# Uses FP32, consider using FP16 for better performance in most cases .\examples\sycl\win-build-sycl.bat ``` @@ -479,11 +548,11 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru ``` @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force -# Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release +# Option 1: Use FP16 (recommended for better performance in most cases) +cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON -# Option 2: Or FP16 -cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON +# Option 2: Or FP32 +cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release cmake --build build --config Release -j ``` @@ -491,10 +560,10 @@ cmake --build build --config Release -j Or, use CMake presets to build: ```sh -cmake --preset x64-windows-sycl-release +cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release cmake --build build-x64-windows-sycl-release -j --target llama-completion -cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release +cmake --preset x64-windows-sycl-release cmake --build build-x64-windows-sycl-release -j --target llama-completion cmake --preset x64-windows-sycl-debug @@ -655,6 +724,15 @@ In two device selection modes, the default SYCL backend is level_zero, you can c |------------------|----------------------------------------| | Single device | --split-mode none --main-gpu DEVICE_ID | | Multiple devices | --split-mode layer (default) | +| Multiple devices | --split-mode tensor (tensor parallelism) | + +`--split-mode tensor` (tensor parallelism) shards each layer across the selected +GPUs. It requires flash attention, which is auto-enabled when `--flash-attn` is +left at its default `auto`, so `--split-mode tensor` works out of the box. +Passing `--flash-attn off` together with `--split-mode tensor` is rejected at +context creation. The default `f16` KV cache is recommended. Tensor parallelism +is currently optimized for 2 GPUs; other device counts fall back to a generic +all-reduce. Examples: @@ -699,7 +777,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). | | GGML_SYCL_DNN | ON *(default)* \|OFF *(Optional)* | Enable build with oneDNN. | | GGML_SYCL_HOST_MEM_FALLBACK | ON *(default)* \|OFF *(Optional)* | Allow host memory fallback when device memory is full during quantized weight reorder. Enables inference to continue at reduced speed (reading over PCIe) instead of failing. Requires Linux kernel 6.8+. | -| GGML_SYCL_SUPPORT_LEVEL_ZERO | ON *(default)* \|OFF *(Optional)* | Enable Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. | +| GGML_SYCL_SUPPORT_LEVEL_ZERO_API | ON *(default)* \|OFF *(Optional)* | Support to use Level Zero API for device memory allocation. Requires Level Zero headers/library at build time and Intel GPU driver (Level Zero runtime) at run time. Reduces system RAM usage during multi-GPU inference. SYCL backend always runs on Level Zero running time even if it's set as OFF (The SYCL api will be usage for memory allocation).| | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. | | CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | @@ -710,14 +788,16 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | Name | Value | Function | |-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------| | GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG | +| GGML_SYCL_DEV2DEV_MEMCPY | 0 (default) or 1 | Choose the SYCL or L0 API in dev2dev memory copy.
Value:
* 0: SYCL API (default)
* 1: L0 API -- L0 API is found to lead to abnormal crash in some case. This debug flag is used to check the issue.| | GGML_SYCL_ENABLE_FLASH_ATTN | 1 (default) or 0| Enable Flash-Attention. It can reduce memory usage. The performance impact depends on the LLM.| | GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features for Intel GPUs. (Recommended to 1 for Intel devices older than Gen 10) | | GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because SYCL Graph is still on development, no better performance. | -| GGML_SYCL_ENABLE_LEVEL_ZERO | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO=ON at build time. | +| GGML_SYCL_USE_LEVEL_ZERO_API | 1 (default) or 0 | Use Level Zero API for device memory allocation instead of SYCL. Reduces system RAM usage on Intel dGPUs by avoiding DMA-buf/TTM host memory staging. Requires GGML_SYCL_SUPPORT_LEVEL_ZERO_API=ON at build time. SYCL backend always runs on Level Zero running time even if it's set as OFF (The SYCL api will be usage for memory allocation).| | GGML_SYCL_DISABLE_DNN | 0 (default) or 1 | Disable running computations through oneDNN and always use oneMKL. | | GGML_SYCL_ENABLE_VMM | 0 or 1 (default) | Enable the virtual-memory device pool. | | ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer | | UR_L0_ENABLE_RELAXED_ALLOCATION_LIMITS | 0 (default) or 1 | Allow SYCL/Unified Runtime Level Zero device allocations larger than 4 GiB. llama.cpp's direct Level Zero allocation path requests the relaxed maximum-size limit itself when GGML_SYCL_ENABLE_LEVEL_ZERO=1. | +| GGML_SYCL_USM_SYSTEM | 0 (default) or 1 | Enable experimental support for [USM system allocations](https://github.khronos.org/SYCL_Reference/iface/usm_basic_concept.html#system-allocations) for large GPU buffers. This requires enough host memory for model weights and caches, an Intel Xe2+ GPU such as BMG or newer and supported on Linux only, with CONFIG_DRM_XE_GPUSVM enabled. | ## Compile-time Flags @@ -728,6 +808,7 @@ Pass these via `CXXFLAGS` or add a one-off `#define` to enable a flag on the spo | DEBUG_SYCL_POOL | Enable device memory pool logging on teardown. Useful for profiling allocations. | | DEBUG_SYCL_MALLOC | Enable verbose per-call logging of device pool alloc/free operations. | + ## Design Rule - Open to all contributors. diff --git a/docs/backend/snapdragon/CMakeUserPresets.json b/docs/backend/snapdragon/CMakeUserPresets.json index d37100764f1d..848d735f1c5c 100644 --- a/docs/backend/snapdragon/CMakeUserPresets.json +++ b/docs/backend/snapdragon/CMakeUserPresets.json @@ -24,7 +24,6 @@ "GGML_LLAMAFILE": "OFF", "GGML_OPENCL": "ON", "GGML_HEXAGON": "ON", - "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128", "LLAMA_OPENSSL": "OFF" } }, @@ -47,7 +46,6 @@ "GGML_LLAMAFILE": "OFF", "GGML_OPENCL": "ON", "GGML_HEXAGON": "ON", - "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128", "LLAMA_OPENSSL": "OFF" } }, @@ -73,7 +71,6 @@ "GGML_LLAMAFILE": "OFF", "GGML_OPENCL": "OFF", "GGML_HEXAGON": "ON", - "GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE": "128", "LLAMA_OPENSSL": "OFF" } }, diff --git a/docs/install.md b/docs/install.md index 7200bf9b7b91..7198e61bf35b 100644 --- a/docs/install.md +++ b/docs/install.md @@ -1,12 +1,40 @@ # Install pre-built version of llama.cpp -| Install via | Windows | Mac | Linux | -|-------------|---------|-----|-------| +| Install via | Windows | Mac | Linux | +|-------------|---------|------|-------| +| conda-forge | ✅ | ✅ | ✅ | | Winget | ✅ | | | | Homebrew | | ✅ | ✅ | | MacPorts | | ✅ | | | Nix | | ✅ | ✅ | +## conda-forge (Windows, Mac and Linux) + +conda-forge provides builds for: + - CUDA (Windows and Linux) + - Vulkan (Windows and Linux) + - Apple Metal (macOS) + +```sh +conda install -c conda-forge llama-cpp +``` + +```sh +mamba install -c conda-forge llama-cpp +``` + +```sh +# Project-local installation +pixi add llama-cpp + +# Global installation +pixi global install llama-cpp +``` + +This distribution is managed on [`conda-forge/llama-cpp-feedstock`](https://github.com/conda-forge/llama.cpp-feedstock/). + +Shall you have any problems, please open an issue on [its issue tracker](https://github.com/conda-forge/llama.cpp-feedstock/issues). + ## Winget (Windows) ```sh diff --git a/docs/multimodal.md b/docs/multimodal.md index 33d1df33c3bb..76065aac4d12 100644 --- a/docs/multimodal.md +++ b/docs/multimodal.md @@ -1,10 +1,11 @@ # Multimodal llama.cpp supports multimodal input via `libmtmd`. Currently, there are 2 tools support this feature: -- [llama-mtmd-cli](../tools/mtmd/README.md) +- [llama-cli](../tools/cli/README.md) - [llama-server](../tools/server/README.md) via OpenAI-compatible `/chat/completions` API +- [llama-mtmd-cli](../tools/mtmd/README.md), for testing and development -Currently, we support **image** and **audio** input. Audio is highly experimental and may have reduced quality. +Currently, we support **image**, **audio** and **video** input. To enable it, you can use one of the 2 methods below: diff --git a/docs/ops.md b/docs/ops.md index 9ef5478de702..6fc8454c8e29 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -14,24 +14,25 @@ Legend: | Operation | BLAS | CANN | CPU | CUDA | MTL | OpenCL | SYCL | Vulkan | WebGPU | ZenDNN | zDNN | |-----------|------|------|------|------|------|------|------|------|------|------|------| -| ABS | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | +| ABS | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | | ACC | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ | | ADD | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | -| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | +| ADD1 | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | ADD_ID | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | | ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | -| CEIL | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | -| CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ❌ | ❌ | +| CEIL | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | +| CLAMP | ❌ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ | +| COL2IM_1D | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ | | CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | ❌ | -| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | -| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | -| CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| CONV_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| CONV_2D_DW | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | +| CONV_3D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | | CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | -| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | -| COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ | +| CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | +| COS | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | | COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | | CROSS_ENTROPY_LOSS | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | @@ -41,82 +42,82 @@ Legend: | DIAG_MASK_INF | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ | | DIV | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | DUP | ❌ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | ❌ | -| ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | -| EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | -| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | +| ELU | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | +| EXP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | +| EXPM1 | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | | FILL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | | FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | -| FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ | -| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | +| FLOOR | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | +| GATED_DELTA_NET | ❌ | ❌ | ✅ | ❌ | 🟡 | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | | GATED_LINEAR_ATTN | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | -| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | -| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | -| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | -| GELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ | -| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ | -| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ | +| GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| GEGLU_QUICK | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| GELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ | +| GELU_ERF | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ | +| GELU_QUICK | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ | | GET_ROWS | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | ✅ | ✅ | 🟡 | ❌ | ❌ | | GET_ROWS_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | GROUP_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | -| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | -| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | +| HARDSIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | +| HARDSWISH | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | | IM2COL | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | IM2COL_3D | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | L2_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | | LEAKY_RELU | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | -| LOG | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | +| LOG | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | | MEAN | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | | MUL | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | MUL_MAT | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | -| MUL_MAT_HADAMARD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | +| MUL_MAT_HADAMARD | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | MUL_MAT_ID | ❌ | 🟡 | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ | -| NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | +| NEG | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | | NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | | OPT_STEP_ADAMW | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | | OPT_STEP_SGD | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | | OUT_PROD | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | ❌ | ❌ | 🟡 | | PAD | ❌ | 🟡 | ✅ | 🟡 | 🟡 | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | | PAD_REFLECT_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | -| POOL_1D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| POOL_1D | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | | POOL_2D | ❌ | 🟡 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | -| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | -| RELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ | -| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ | +| REGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| RELU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ | +| REPEAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ | | REPEAT_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | RMS_NORM | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | RMS_NORM_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | ROLL | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | ROPE | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | -| ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ | +| ROUND | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | | RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | SET | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | | SET_ROWS | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | -| SGN | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | -| SIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ | -| SILU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | ❌ | +| SGN | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | +| SIGMOID | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ | +| SILU | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ | | SILU_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | -| SIN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ | -| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | +| SIN | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | +| SOFTPLUS | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | | SOFT_MAX | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | ❌ | ❌ | | SOLVE_TRI | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | ✅ | ✅ | ❌ | ❌ | -| SQR | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | -| SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ❌ | ❌ | +| SQR | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | +| SQRT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | | SSM_CONV | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ | -| STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | 🟡 | ✅ | ❌ | ❌ | +| STEP | ❌ | ✅ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | | SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | SUM | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | 🟡 | 🟡 | 🟡 | ❌ | ❌ | | SUM_ROWS | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | ❌ | -| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | -| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | -| TANH | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ❌ | ❌ | +| SWIGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| SWIGLU_OAI | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | +| TANH | ❌ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | | TOP_K | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ | | TRI | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | -| TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | 🟡 | 🟡 | ✅ | ❌ | ❌ | +| TRUNC | ❌ | ❌ | ✅ | 🟡 | ✅ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | | UPSCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | ✅ | ❌ | ❌ | | XIELU | ❌ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | diff --git a/docs/ops/SYCL.csv b/docs/ops/SYCL.csv index afb4fa78740d..80a1c29c1232 100644 --- a/docs/ops/SYCL.csv +++ b/docs/ops/SYCL.csv @@ -27,20 +27,20 @@ "SYCL0","HARDSIGMOID","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","EXP","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" "SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" -"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" -"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" +"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" "SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" -"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" -"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" +"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" "SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" -"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" -"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" -"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" -"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" +"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" +"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","ABS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" "SYCL0","ABS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","SGN","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" @@ -69,20 +69,20 @@ "SYCL0","HARDSIGMOID","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","EXP","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" "SYCL0","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" -"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" -"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" +"SYCL0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" +"SYCL0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" "SYCL0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" -"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" -"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" +"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" +"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" "SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" -"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" -"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" -"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" -"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" +"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" +"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" +"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" +"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" "SYCL0","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" @@ -111,8 +111,8 @@ "SYCL0","HARDSIGMOID","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","EXP","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" "SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" -"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" -"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" +"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" "SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" @@ -153,20 +153,20 @@ "SYCL0","HARDSIGMOID","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","EXP","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" "SYCL0","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" -"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" -"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" +"SYCL0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" +"SYCL0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" "SYCL0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" "SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" -"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" -"SYCL0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" +"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" +"SYCL0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","CEIL","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" "SYCL0","CEIL","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" -"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" -"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" -"SYCL0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" -"SYCL0","TRUNC","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" +"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" +"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" +"SYCL0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","SYCL" +"SYCL0","TRUNC","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","SYCL" "SYCL0","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=0","support","1","yes","SYCL" "SYCL0","REGLU","type=f16,ne_a=[5,7,11,13],v=0,swapped=0","support","1","yes","SYCL" "SYCL0","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=1","support","1","yes","SYCL" @@ -582,42 +582,42 @@ "SYCL0","SET_ROWS","type=q8_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL" "SYCL0","SET_ROWS","type=q8_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","1","yes","SYCL" "SYCL0","SET_ROWS","type=q8_0,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL" -"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,1,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,7,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,7,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,1,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,1,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,7,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","0","no","SYCL" -"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,7,1],nr23=[2,3],r=2,v=1","support","0","no","SYCL" +"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,1,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,1,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,7,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,7,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,1,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,1,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,1,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,1,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,7,1],nr23=[2,3],r=2,v=0","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","1","yes","SYCL" +"SYCL0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,7,1],nr23=[2,3],r=2,v=1","support","1","yes","SYCL" "SYCL0","SET_ROWS","type=q2_K,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","SYCL" "SYCL0","SET_ROWS","type=q2_K,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","SYCL" "SYCL0","SET_ROWS","type=q2_K,type_idx=i64,ne=[768,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","SYCL" @@ -914,57 +914,58 @@ "SYCL0","POOL_2D","pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=3,k1=3,s0=2,s1=2,p0=0,p1=1","support","1","yes","SYCL" "SYCL0","POOL_2D","pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=3,k1=3,s0=2,s1=2,p0=1,p1=0","support","1","yes","SYCL" "SYCL0","POOL_2D","pool_type=max,type_input=f32,ne_input=[10,10,3,1],k0=3,k1=3,s0=2,s1=2,p0=1,p1=1","support","1","yes","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=0","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=1","support","0","no","SYCL" -"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=1","support","0","no","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=avg,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=1,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=1,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=1,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=1,s0=2,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=1,s0=2,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=1,s0=2,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=1,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=1,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=1,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=0","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[10,3,2,1],k0=3,s0=2,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[11,1,3,2],k0=3,s0=2,p0=1","support","1","yes","SYCL" +"SYCL0","POOL_1D","pool_type=max,type_input=f32,ne_input=[128,2,1,3],k0=3,s0=2,p0=1","support","1","yes","SYCL" "SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[3000,128,1,1],ne_kernel=[3,128,1280,1],s0=1,s1=0,p0=1,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL" "SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[3000,128,1,1],ne_kernel=[3,128,1280,1],s0=1,s1=0,p0=1,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL" "SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[3000,128,1,1],ne_kernel=[3,128,1280,1],s0=1,s1=0,p0=1,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL" +"SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[3000,384,1,1],ne_kernel=[3,384,384,1],s0=1,s1=0,p0=1,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL" "SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[20,2,2,1],ne_kernel=[3,2,2,1],s0=1,s1=0,p0=0,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL" "SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[20,2,2,1],ne_kernel=[3,2,2,1],s0=1,s1=0,p0=0,p1=0,d0=3,d1=0,is_2D=0","support","1","yes","SYCL" "SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[20,2,2,1],ne_kernel=[3,2,2,1],s0=1,s1=0,p0=3,p1=0,d0=1,d1=0,is_2D=0","support","1","yes","SYCL" @@ -1050,6 +1051,8 @@ "SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,2,2560],ne_kernel=[3,3,2,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","SYCL" "SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[5,5,1,32],ne_kernel=[3,4,1,32],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","1","yes","SYCL" "SYCL0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[2,2,1536,729],ne_kernel=[2,2,1536,4096],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","1","yes","SYCL" +"SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[128,128,1,2],ne_kernel=[32,33,1,2],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","SYCL" +"SYCL0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[128,128,2,1],ne_kernel=[33,34,2,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","SYCL" "SYCL0","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","1","yes","SYCL" "SYCL0","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","1","yes","SYCL" "SYCL0","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","1","yes","SYCL" @@ -3101,1836 +3104,1836 @@ "SYCL0","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[20,20,10,3],ne_kernel=[3,3,3,3],IC=1,s0=3,s1=3,s2=3,p0=3,p1=3,p2=3,d0=3,d1=3,d2=3,v=1","support","1","yes","SYCL" "SYCL0","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[20,20,10,3],ne_kernel=[3,3,3,3],IC=3,s0=3,s1=3,s2=3,p0=3,p1=3,p2=3,d0=3,d1=3,d2=3,v=0","support","1","yes","SYCL" "SYCL0","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[20,20,10,3],ne_kernel=[3,3,3,3],IC=3,s0=3,s1=3,s2=3,p0=3,p1=3,p2=3,d0=3,d1=3,d2=3,v=1","support","1","yes","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D_DW","ne_input=[17,34,9,1],ne_kernel=[3,3,1,9],stride=1,padding=0,dilation=1,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D_DW","ne_input=[17,34,9,1],ne_kernel=[3,3,1,9],stride=1,padding=0,dilation=1,cwhn=1","support","0","no","SYCL" -"SYCL0","CONV_2D_DW","ne_input=[32,8,64,1],ne_kernel=[3,3,1,64],stride=2,padding=1,dilation=1,cwhn=0","support","0","no","SYCL" -"SYCL0","CONV_2D_DW","ne_input=[32,8,64,1],ne_kernel=[3,3,1,64],stride=2,padding=1,dilation=1,cwhn=1","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=4,ID=8,IH=8,IW=8,OC=8,KD=1,KH=1,KW=1,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","0","no","SYCL" -"SYCL0","CONV_3D","N=1,IC=4,ID=8,IH=8,IW=8,OC=8,KD=1,KH=1,KW=1,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","0","no","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f32,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f16,stride0=1,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=2,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,1,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,2,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,3,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[1,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[2,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[3,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,1,2],ne_kernel=[11,11,1,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,1],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,1,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,2,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,1,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,3,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[1,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[2,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[1,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[3,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f32,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D","ne_input=[141,133,25,2],ne_kernel=[11,11,25,12],type_kernel=f16,stride0=3,stride1=5,padding0=5,padding1=5,dilation0=2,dilation1=4,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D_DW","ne_input=[17,34,9,1],ne_kernel=[3,3,1,9],stride=1,padding=0,dilation=1,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D_DW","ne_input=[17,34,9,1],ne_kernel=[3,3,1,9],stride=1,padding=0,dilation=1,cwhn=1","support","1","yes","SYCL" +"SYCL0","CONV_2D_DW","ne_input=[32,8,64,1],ne_kernel=[3,3,1,64],stride=2,padding=1,dilation=1,cwhn=0","support","1","yes","SYCL" +"SYCL0","CONV_2D_DW","ne_input=[32,8,64,1],ne_kernel=[3,3,1,64],stride=2,padding=1,dilation=1,cwhn=1","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=4,ID=8,IH=8,IW=8,OC=8,KD=1,KH=1,KW=1,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f32","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=1,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=1,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=0,p1=0,p2=0,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=3,KW=3,s0=2,s1=2,s2=2,p0=1,p1=1,p2=1,d0=2,d1=2,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=2,IC=3,ID=18,IH=22,IW=20,OC=4,KD=3,KH=1,KW=5,s0=2,s1=1,s2=1,p0=2,p1=0,p2=1,d0=1,d1=1,d2=2,type_kernel=f16","support","1","yes","SYCL" +"SYCL0","CONV_3D","N=1,IC=4,ID=8,IH=8,IW=8,OC=8,KD=1,KH=1,KW=1,s0=1,s1=1,s2=1,p0=0,p1=0,p2=0,d0=1,d1=1,d2=1,type_kernel=f16","support","1","yes","SYCL" "SYCL0","CONV_TRANSPOSE_1D","ne_input=[1,1,1,1],ne_kernel=[1,1,1,1],s0=1,p0=0,d0=1","support","1","yes","SYCL" "SYCL0","CONV_TRANSPOSE_1D","ne_input=[1,1,1,1],ne_kernel=[1,1,1,1],s0=2,p0=0,d0=1","support","1","yes","SYCL" "SYCL0","CONV_TRANSPOSE_1D","ne_input=[1,1,1,1],ne_kernel=[1,1,1,1],s0=3,p0=0,d0=1","support","1","yes","SYCL" @@ -5047,12 +5050,45 @@ "SYCL0","CONV_TRANSPOSE_1D","ne_input=[3,2,1,1],ne_kernel=[3,2,2,1],s0=1,p0=0,d0=1","support","1","yes","SYCL" "SYCL0","CONV_TRANSPOSE_1D","ne_input=[3,2,1,1],ne_kernel=[3,1,2,1],s0=1,p0=0,d0=1","support","1","yes","SYCL" "SYCL0","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","SYCL" -"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","0","no","SYCL" -"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","0","no","SYCL" -"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","0","no","SYCL" -"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f16,ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","0","no","SYCL" -"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f16,ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","0","no","SYCL" -"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f16,ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f32,K=16,OC=32,T_in=197,s0=8,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f32,K=4,OC=3,T_in=7,s0=2,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f32,K=1,OC=5,T_in=13,s0=1,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f32,K=6,OC=4,T_in=11,s0=3,p0=1","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f32,K=2,OC=3,T_in=9,s0=3,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f32,K=5,OC=4,T_in=11,s0=2,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f32,K=8,OC=4,T_in=13,s0=4,p0=2","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f32,K=4,OC=3,T_in=1,s0=2,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f32,K=16,OC=1,T_in=197,s0=8,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f32,K=1,OC=5,T_in=13,s0=3,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f32,K=8,OC=2,T_in=3,s0=2,p0=5","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f16,K=16,OC=32,T_in=197,s0=8,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f16,K=4,OC=3,T_in=7,s0=2,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f16,K=1,OC=5,T_in=13,s0=1,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f16,K=6,OC=4,T_in=11,s0=3,p0=1","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f16,K=2,OC=3,T_in=9,s0=3,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f16,K=5,OC=4,T_in=11,s0=2,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f16,K=8,OC=4,T_in=13,s0=4,p0=2","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f16,K=4,OC=3,T_in=1,s0=2,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f16,K=16,OC=1,T_in=197,s0=8,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f16,K=1,OC=5,T_in=13,s0=3,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=f16,K=8,OC=2,T_in=3,s0=2,p0=5","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=bf16,K=16,OC=32,T_in=197,s0=8,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=bf16,K=4,OC=3,T_in=7,s0=2,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=bf16,K=1,OC=5,T_in=13,s0=1,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=bf16,K=6,OC=4,T_in=11,s0=3,p0=1","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=bf16,K=2,OC=3,T_in=9,s0=3,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=bf16,K=5,OC=4,T_in=11,s0=2,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=bf16,K=8,OC=4,T_in=13,s0=4,p0=2","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=bf16,K=4,OC=3,T_in=1,s0=2,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=bf16,K=16,OC=1,T_in=197,s0=8,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=bf16,K=1,OC=5,T_in=13,s0=3,p0=0","support","0","no","SYCL" +"SYCL0","COL2IM_1D","type=bf16,K=8,OC=2,T_in=3,s0=2,p0=5","support","0","no","SYCL" +"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","SYCL" +"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","SYCL" +"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","SYCL" +"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f16,ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","SYCL" +"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f16,ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","SYCL" +"SYCL0","CONV_TRANSPOSE_2D","kernel_type=f16,ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","SYCL" "SYCL0","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","SYCL" "SYCL0","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","SYCL" "SYCL0","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","SYCL" @@ -5069,6 +5105,7 @@ "SYCL0","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","SYCL" "SYCL0","REPEAT","type=i32,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","SYCL" "SYCL0","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","SYCL" +"SYCL0","REPEAT","type=bf16,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","SYCL" "SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,1]","support","1","yes","SYCL" "SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL" "SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,2,1,1]","support","1","yes","SYCL" @@ -5076,6 +5113,7 @@ "SYCL0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","SYCL" "SYCL0","REPEAT","type=i32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL" "SYCL0","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","SYCL" +"SYCL0","REPEAT","type=bf16,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","SYCL" "SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,1,1,1],v=0","support","1","yes","SYCL" "SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[2,1,1,1],v=0","support","1","yes","SYCL" "SYCL0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,2,1,1],v=0","support","1","yes","SYCL" @@ -6185,6 +6223,7 @@ "SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=128,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=64,n=1,k=64,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=256,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" +"SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=512,n=1,k=512,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=128,n=32,k=128,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=128,n=4,k=128,bs=[2,3],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" "SYCL0","MUL_MAT","type_a=f32,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","SYCL" @@ -7603,6 +7642,31 @@ "SYCL0","MUL_MAT_ID","type_a=f16,type_b=f32,n_mats=16,n_used=16,b=1,m=50,n=200,k=64","support","1","yes","SYCL" "SYCL0","MUL_MAT_ID","type_a=f16,type_b=f32,n_mats=1,n_used=1,b=0,m=8,n=16,k=1","support","1","yes","SYCL" "SYCL0","MUL_MAT_ID","type_a=mxfp4,type_b=f32,n_mats=32,n_used=2,b=0,m=2880,n=32,k=2880","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=f32,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=3","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=f16,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=3","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=bf16,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=3","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=q4_0,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=q4_1,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=q5_0,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=q5_1,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=q8_0,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=384","support","0","no","SYCL" +"SYCL0","MUL_MAT_ID","type_a=mxfp4,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=192","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=q2_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=q3_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=q4_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=q5_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=q6_K,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=iq2_xxs,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=iq2_xs,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=iq2_s,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=iq3_xxs,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=iq1_s,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=iq1_m,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=iq4_nl,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=96","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=iq3_s,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL" +"SYCL0","MUL_MAT_ID","type_a=iq4_xs,type_b=f32,n_mats=4,n_used=2,b=0,m=64,n=16,k=768","support","1","yes","SYCL" "SYCL0","MUL_MAT_ID","type_a=f32,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=1,k=256","support","1","yes","SYCL" "SYCL0","MUL_MAT_ID","type_a=f32,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=4,k=256","support","1","yes","SYCL" "SYCL0","MUL_MAT_ID","type_a=f32,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=5,k=256","support","1","yes","SYCL" @@ -9679,39 +9743,39 @@ "SYCL0","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=1","support","1","yes","SYCL" "SYCL0","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=32","support","1","yes","SYCL" "SYCL0","ADD_ID","type_a=f32,type_b=f32,n_embd=129,n_experts=8,n_experts_used=4,n_token=129","support","1","yes","SYCL" -"SYCL0","SQR","type=f16,ne=[10,5,4,3]","support","0","no","SYCL" -"SYCL0","SQRT","type=f16,ne=[10,3,3,2]","support","0","no","SYCL" -"SYCL0","LOG","type=f16,ne=[10,5,4,3]","support","0","no","SYCL" -"SYCL0","SIN","type=f16,ne=[10,2,2,2]","support","0","no","SYCL" -"SYCL0","COS","type=f16,ne=[10,2,2,2]","support","0","no","SYCL" -"SYCL0","CLAMP","type=f16,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","0","no","SYCL" +"SYCL0","SQR","type=f16,ne=[10,5,4,3]","support","1","yes","SYCL" +"SYCL0","SQRT","type=f16,ne=[10,3,3,2]","support","1","yes","SYCL" +"SYCL0","LOG","type=f16,ne=[10,5,4,3]","support","1","yes","SYCL" +"SYCL0","SIN","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL" +"SYCL0","COS","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL" +"SYCL0","CLAMP","type=f16,ne=[10,5,4,3],min=-0.500000,max=0.500000","support","1","yes","SYCL" "SYCL0","LEAKY_RELU","type=f16,ne_a=[10,5,4,3],negative_slope=0.100000","support","1","yes","SYCL" -"SYCL0","FLOOR","type=f16,ne=[10,2,2,2]","support","0","no","SYCL" +"SYCL0","FLOOR","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL" "SYCL0","CEIL","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL" -"SYCL0","ROUND","type=f16,ne=[10,2,2,2]","support","0","no","SYCL" -"SYCL0","TRUNC","type=f16,ne=[10,2,2,2]","support","0","no","SYCL" -"SYCL0","SQR","type=f16,ne=[7,1,5,3]","support","0","no","SYCL" -"SYCL0","SQR","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL" -"SYCL0","SQRT","type=f16,ne=[7,1,5,3]","support","0","no","SYCL" -"SYCL0","SQRT","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL" -"SYCL0","LOG","type=f16,ne=[7,1,5,3]","support","0","no","SYCL" -"SYCL0","LOG","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL" -"SYCL0","SIN","type=f16,ne=[7,1,5,3]","support","0","no","SYCL" -"SYCL0","SIN","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL" -"SYCL0","COS","type=f16,ne=[7,1,5,3]","support","0","no","SYCL" -"SYCL0","COS","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL" -"SYCL0","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","0","no","SYCL" -"SYCL0","CLAMP","type=f16,ne=[1024,1024,1,1],min=-0.500000,max=0.500000","support","0","no","SYCL" +"SYCL0","ROUND","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL" +"SYCL0","TRUNC","type=f16,ne=[10,2,2,2]","support","1","yes","SYCL" +"SYCL0","SQR","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL" +"SYCL0","SQR","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL" +"SYCL0","SQRT","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL" +"SYCL0","SQRT","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL" +"SYCL0","LOG","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL" +"SYCL0","LOG","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL" +"SYCL0","SIN","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL" +"SYCL0","SIN","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL" +"SYCL0","COS","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL" +"SYCL0","COS","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL" +"SYCL0","CLAMP","type=f16,ne=[7,1,5,3],min=-0.500000,max=0.500000","support","1","yes","SYCL" +"SYCL0","CLAMP","type=f16,ne=[1024,1024,1,1],min=-0.500000,max=0.500000","support","1","yes","SYCL" "SYCL0","LEAKY_RELU","type=f16,ne_a=[7,1,5,3],negative_slope=0.100000","support","1","yes","SYCL" "SYCL0","LEAKY_RELU","type=f16,ne_a=[1024,1024,1,1],negative_slope=0.100000","support","1","yes","SYCL" -"SYCL0","FLOOR","type=f16,ne=[7,1,5,3]","support","0","no","SYCL" -"SYCL0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL" +"SYCL0","FLOOR","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL" +"SYCL0","FLOOR","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL" "SYCL0","CEIL","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL" "SYCL0","CEIL","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL" -"SYCL0","ROUND","type=f16,ne=[7,1,5,3]","support","0","no","SYCL" -"SYCL0","ROUND","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL" -"SYCL0","TRUNC","type=f16,ne=[7,1,5,3]","support","0","no","SYCL" -"SYCL0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","0","no","SYCL" +"SYCL0","ROUND","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL" +"SYCL0","ROUND","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL" +"SYCL0","TRUNC","type=f16,ne=[7,1,5,3]","support","1","yes","SYCL" +"SYCL0","TRUNC","type=f16,ne=[1024,1024,1,1]","support","1","yes","SYCL" "SYCL0","SQR","type=f32,ne=[10,5,4,3]","support","1","yes","SYCL" "SYCL0","SQRT","type=f32,ne=[10,3,3,2]","support","1","yes","SYCL" "SYCL0","LOG","type=f32,ne=[10,5,4,3]","support","1","yes","SYCL" @@ -10845,37 +10909,117 @@ "SYCL0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=1","support","1","yes","SYCL" "SYCL0","ROPE","type=f16,ne_a=[128,32,2,3],n_dims=128,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=1","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=0","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=0","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=0","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=0","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=1","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=1","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=1","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=1","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=2","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=2","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=2","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=2","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=0,v=3","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=1,v=3","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=2,v=3","support","1","yes","SYCL" "SYCL0","CONCAT","type=f32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=f16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=bf16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i8,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i16,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL" "SYCL0","CONCAT","type=i32,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL" +"SYCL0","CONCAT","type=i64,ne_a=[11,12,13,14],ne_b_d=7,dim=3,v=3","support","1","yes","SYCL" "SYCL0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","SYCL" "SYCL0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","SYCL" "SYCL0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","SYCL" @@ -16515,6 +16659,7 @@ "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=f16,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f32,permute=[0,1,2,3]","support","1","yes","SYCL" +"SYCL0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=256,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q1_0,type_V=q1_0,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=128,hsv=64,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q1_0,type_V=q4_0,permute=[0,1,2,3]","support","0","no","SYCL" "SYCL0","FLASH_ATTN_EXT","hsk=64,hsv=128,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q1_0,permute=[0,1,2,3]","support","0","no","SYCL" diff --git a/docs/ops/Vulkan.csv b/docs/ops/Vulkan.csv index e693154968d3..3aa7976a2087 100644 --- a/docs/ops/Vulkan.csv +++ b/docs/ops/Vulkan.csv @@ -27,8 +27,8 @@ "Vulkan0","HARDSIGMOID","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan" "Vulkan0","EXP","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan" "Vulkan0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan" -"Vulkan0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan" -"Vulkan0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan" +"Vulkan0","EXPM1","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan" +"Vulkan0","EXPM1","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan" "Vulkan0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan" "Vulkan0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan" "Vulkan0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan" @@ -41,48 +41,48 @@ "Vulkan0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan" "Vulkan0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan" "Vulkan0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan" -"Vulkan0","ABS","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","ABS","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","SGN","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","SGN","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","NEG","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","NEG","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","STEP","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","STEP","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","TANH","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","TANH","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","ELU","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","ELU","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","RELU","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","RELU","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","SIGMOID","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","SIGMOID","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","GELU","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","GELU","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","GELU_QUICK","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","GELU_QUICK","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","SILU","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","SILU","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","HARDSWISH","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","HARDSWISH","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","HARDSIGMOID","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","HARDSIGMOID","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","EXP","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","CEIL","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","CEIL","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" +"Vulkan0","ABS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","ABS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","SGN","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","SGN","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","NEG","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","NEG","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","STEP","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","STEP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","TANH","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","TANH","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","ELU","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","ELU","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","RELU","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","RELU","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","SIGMOID","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","SIGMOID","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","GELU","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","GELU","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","GELU_QUICK","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","GELU_QUICK","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","SILU","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","SILU","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","HARDSWISH","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","HARDSWISH","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","HARDSIGMOID","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","HARDSIGMOID","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","EXP","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","EXPM1","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","EXPM1","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","SOFTPLUS","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","SOFTPLUS","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","CEIL","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","CEIL","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" "Vulkan0","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan" "Vulkan0","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan" "Vulkan0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan" @@ -111,8 +111,8 @@ "Vulkan0","HARDSIGMOID","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan" "Vulkan0","EXP","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan" "Vulkan0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan" -"Vulkan0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","Vulkan" -"Vulkan0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","Vulkan" +"Vulkan0","EXPM1","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan" +"Vulkan0","EXPM1","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan" "Vulkan0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan" "Vulkan0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan" "Vulkan0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan" @@ -125,48 +125,48 @@ "Vulkan0","ROUND","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan" "Vulkan0","TRUNC","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","Vulkan" "Vulkan0","TRUNC","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","Vulkan" -"Vulkan0","ABS","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","ABS","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","SGN","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","SGN","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","NEG","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","NEG","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","STEP","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","STEP","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","TANH","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","TANH","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","ELU","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","ELU","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","RELU","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","RELU","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","SIGMOID","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","SIGMOID","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","GELU","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","GELU","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","GELU_QUICK","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","GELU_QUICK","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","SILU","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","SILU","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","HARDSWISH","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","HARDSWISH","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","HARDSIGMOID","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","HARDSIGMOID","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","EXP","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","CEIL","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","CEIL","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" -"Vulkan0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","Vulkan" -"Vulkan0","TRUNC","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","Vulkan" +"Vulkan0","ABS","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","ABS","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","SGN","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","SGN","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","NEG","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","NEG","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","STEP","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","STEP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","TANH","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","TANH","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","ELU","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","ELU","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","RELU","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","RELU","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","SIGMOID","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","SIGMOID","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","GELU","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","GELU","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","GELU_QUICK","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","GELU_QUICK","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","SILU","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","SILU","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","HARDSWISH","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","HARDSWISH","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","HARDSIGMOID","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","HARDSIGMOID","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","EXP","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","EXPM1","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","EXPM1","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","SOFTPLUS","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","SOFTPLUS","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","CEIL","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","CEIL","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" +"Vulkan0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","1","yes","Vulkan" +"Vulkan0","TRUNC","type=f32,ne_a=[5,7,11,13],v=1","support","1","yes","Vulkan" "Vulkan0","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=0","support","1","yes","Vulkan" "Vulkan0","REGLU","type=f16,ne_a=[5,7,11,13],v=0,swapped=0","support","1","yes","Vulkan" "Vulkan0","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=1","support","1","yes","Vulkan" @@ -197,36 +197,36 @@ "Vulkan0","GEGLU_QUICK","type=f16,ne_a=[5,7,11,13],v=0,swapped=1","support","1","yes","Vulkan" "Vulkan0","GEGLU_QUICK","type=f16,ne_a=[128,2,2,2],v=0,split","support","1","yes","Vulkan" "Vulkan0","GEGLU_QUICK","type=f16,ne_a=[5,7,11,13],v=0,split","support","1","yes","Vulkan" -"Vulkan0","REGLU","type=f16,ne_a=[128,2,2,2],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","REGLU","type=f16,ne_a=[5,7,11,13],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","REGLU","type=f16,ne_a=[128,2,2,2],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","REGLU","type=f16,ne_a=[5,7,11,13],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","REGLU","type=f16,ne_a=[128,2,2,2],v=1,split","support","0","no","Vulkan" -"Vulkan0","REGLU","type=f16,ne_a=[5,7,11,13],v=1,split","support","0","no","Vulkan" -"Vulkan0","GEGLU","type=f16,ne_a=[128,2,2,2],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","GEGLU","type=f16,ne_a=[5,7,11,13],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","GEGLU","type=f16,ne_a=[128,2,2,2],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","GEGLU","type=f16,ne_a=[5,7,11,13],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","GEGLU","type=f16,ne_a=[128,2,2,2],v=1,split","support","0","no","Vulkan" -"Vulkan0","GEGLU","type=f16,ne_a=[5,7,11,13],v=1,split","support","0","no","Vulkan" -"Vulkan0","SWIGLU","type=f16,ne_a=[128,2,2,2],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","SWIGLU","type=f16,ne_a=[5,7,11,13],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","SWIGLU","type=f16,ne_a=[128,2,2,2],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","SWIGLU","type=f16,ne_a=[5,7,11,13],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","SWIGLU","type=f16,ne_a=[128,2,2,2],v=1,split","support","0","no","Vulkan" -"Vulkan0","SWIGLU","type=f16,ne_a=[5,7,11,13],v=1,split","support","0","no","Vulkan" -"Vulkan0","GEGLU_ERF","type=f16,ne_a=[128,2,2,2],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","GEGLU_ERF","type=f16,ne_a=[5,7,11,13],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","GEGLU_ERF","type=f16,ne_a=[128,2,2,2],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","GEGLU_ERF","type=f16,ne_a=[5,7,11,13],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","GEGLU_ERF","type=f16,ne_a=[128,2,2,2],v=1,split","support","0","no","Vulkan" -"Vulkan0","GEGLU_ERF","type=f16,ne_a=[5,7,11,13],v=1,split","support","0","no","Vulkan" -"Vulkan0","GEGLU_QUICK","type=f16,ne_a=[128,2,2,2],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","GEGLU_QUICK","type=f16,ne_a=[5,7,11,13],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","GEGLU_QUICK","type=f16,ne_a=[128,2,2,2],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","GEGLU_QUICK","type=f16,ne_a=[5,7,11,13],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","GEGLU_QUICK","type=f16,ne_a=[128,2,2,2],v=1,split","support","0","no","Vulkan" -"Vulkan0","GEGLU_QUICK","type=f16,ne_a=[5,7,11,13],v=1,split","support","0","no","Vulkan" +"Vulkan0","REGLU","type=f16,ne_a=[128,2,2,2],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","REGLU","type=f16,ne_a=[5,7,11,13],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","REGLU","type=f16,ne_a=[128,2,2,2],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","REGLU","type=f16,ne_a=[5,7,11,13],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","REGLU","type=f16,ne_a=[128,2,2,2],v=1,split","support","1","yes","Vulkan" +"Vulkan0","REGLU","type=f16,ne_a=[5,7,11,13],v=1,split","support","1","yes","Vulkan" +"Vulkan0","GEGLU","type=f16,ne_a=[128,2,2,2],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","GEGLU","type=f16,ne_a=[5,7,11,13],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","GEGLU","type=f16,ne_a=[128,2,2,2],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","GEGLU","type=f16,ne_a=[5,7,11,13],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","GEGLU","type=f16,ne_a=[128,2,2,2],v=1,split","support","1","yes","Vulkan" +"Vulkan0","GEGLU","type=f16,ne_a=[5,7,11,13],v=1,split","support","1","yes","Vulkan" +"Vulkan0","SWIGLU","type=f16,ne_a=[128,2,2,2],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","SWIGLU","type=f16,ne_a=[5,7,11,13],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","SWIGLU","type=f16,ne_a=[128,2,2,2],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","SWIGLU","type=f16,ne_a=[5,7,11,13],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","SWIGLU","type=f16,ne_a=[128,2,2,2],v=1,split","support","1","yes","Vulkan" +"Vulkan0","SWIGLU","type=f16,ne_a=[5,7,11,13],v=1,split","support","1","yes","Vulkan" +"Vulkan0","GEGLU_ERF","type=f16,ne_a=[128,2,2,2],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","GEGLU_ERF","type=f16,ne_a=[5,7,11,13],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","GEGLU_ERF","type=f16,ne_a=[128,2,2,2],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","GEGLU_ERF","type=f16,ne_a=[5,7,11,13],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","GEGLU_ERF","type=f16,ne_a=[128,2,2,2],v=1,split","support","1","yes","Vulkan" +"Vulkan0","GEGLU_ERF","type=f16,ne_a=[5,7,11,13],v=1,split","support","1","yes","Vulkan" +"Vulkan0","GEGLU_QUICK","type=f16,ne_a=[128,2,2,2],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","GEGLU_QUICK","type=f16,ne_a=[5,7,11,13],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","GEGLU_QUICK","type=f16,ne_a=[128,2,2,2],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","GEGLU_QUICK","type=f16,ne_a=[5,7,11,13],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","GEGLU_QUICK","type=f16,ne_a=[128,2,2,2],v=1,split","support","1","yes","Vulkan" +"Vulkan0","GEGLU_QUICK","type=f16,ne_a=[5,7,11,13],v=1,split","support","1","yes","Vulkan" "Vulkan0","REGLU","type=f32,ne_a=[128,2,2,2],v=0,swapped=0","support","1","yes","Vulkan" "Vulkan0","REGLU","type=f32,ne_a=[5,7,11,13],v=0,swapped=0","support","1","yes","Vulkan" "Vulkan0","REGLU","type=f32,ne_a=[128,2,2,2],v=0,swapped=1","support","1","yes","Vulkan" @@ -257,44 +257,44 @@ "Vulkan0","GEGLU_QUICK","type=f32,ne_a=[5,7,11,13],v=0,swapped=1","support","1","yes","Vulkan" "Vulkan0","GEGLU_QUICK","type=f32,ne_a=[128,2,2,2],v=0,split","support","1","yes","Vulkan" "Vulkan0","GEGLU_QUICK","type=f32,ne_a=[5,7,11,13],v=0,split","support","1","yes","Vulkan" -"Vulkan0","REGLU","type=f32,ne_a=[128,2,2,2],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","REGLU","type=f32,ne_a=[5,7,11,13],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","REGLU","type=f32,ne_a=[128,2,2,2],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","REGLU","type=f32,ne_a=[5,7,11,13],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","REGLU","type=f32,ne_a=[128,2,2,2],v=1,split","support","0","no","Vulkan" -"Vulkan0","REGLU","type=f32,ne_a=[5,7,11,13],v=1,split","support","0","no","Vulkan" -"Vulkan0","GEGLU","type=f32,ne_a=[128,2,2,2],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","GEGLU","type=f32,ne_a=[5,7,11,13],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","GEGLU","type=f32,ne_a=[128,2,2,2],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","GEGLU","type=f32,ne_a=[5,7,11,13],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","GEGLU","type=f32,ne_a=[128,2,2,2],v=1,split","support","0","no","Vulkan" -"Vulkan0","GEGLU","type=f32,ne_a=[5,7,11,13],v=1,split","support","0","no","Vulkan" -"Vulkan0","SWIGLU","type=f32,ne_a=[128,2,2,2],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","SWIGLU","type=f32,ne_a=[5,7,11,13],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","SWIGLU","type=f32,ne_a=[128,2,2,2],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","SWIGLU","type=f32,ne_a=[5,7,11,13],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","SWIGLU","type=f32,ne_a=[128,2,2,2],v=1,split","support","0","no","Vulkan" -"Vulkan0","SWIGLU","type=f32,ne_a=[5,7,11,13],v=1,split","support","0","no","Vulkan" -"Vulkan0","GEGLU_ERF","type=f32,ne_a=[128,2,2,2],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","GEGLU_ERF","type=f32,ne_a=[5,7,11,13],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","GEGLU_ERF","type=f32,ne_a=[128,2,2,2],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","GEGLU_ERF","type=f32,ne_a=[5,7,11,13],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","GEGLU_ERF","type=f32,ne_a=[128,2,2,2],v=1,split","support","0","no","Vulkan" -"Vulkan0","GEGLU_ERF","type=f32,ne_a=[5,7,11,13],v=1,split","support","0","no","Vulkan" -"Vulkan0","GEGLU_QUICK","type=f32,ne_a=[128,2,2,2],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","GEGLU_QUICK","type=f32,ne_a=[5,7,11,13],v=1,swapped=0","support","0","no","Vulkan" -"Vulkan0","GEGLU_QUICK","type=f32,ne_a=[128,2,2,2],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","GEGLU_QUICK","type=f32,ne_a=[5,7,11,13],v=1,swapped=1","support","0","no","Vulkan" -"Vulkan0","GEGLU_QUICK","type=f32,ne_a=[128,2,2,2],v=1,split","support","0","no","Vulkan" -"Vulkan0","GEGLU_QUICK","type=f32,ne_a=[5,7,11,13],v=1,split","support","0","no","Vulkan" +"Vulkan0","REGLU","type=f32,ne_a=[128,2,2,2],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","REGLU","type=f32,ne_a=[5,7,11,13],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","REGLU","type=f32,ne_a=[128,2,2,2],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","REGLU","type=f32,ne_a=[5,7,11,13],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","REGLU","type=f32,ne_a=[128,2,2,2],v=1,split","support","1","yes","Vulkan" +"Vulkan0","REGLU","type=f32,ne_a=[5,7,11,13],v=1,split","support","1","yes","Vulkan" +"Vulkan0","GEGLU","type=f32,ne_a=[128,2,2,2],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","GEGLU","type=f32,ne_a=[5,7,11,13],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","GEGLU","type=f32,ne_a=[128,2,2,2],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","GEGLU","type=f32,ne_a=[5,7,11,13],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","GEGLU","type=f32,ne_a=[128,2,2,2],v=1,split","support","1","yes","Vulkan" +"Vulkan0","GEGLU","type=f32,ne_a=[5,7,11,13],v=1,split","support","1","yes","Vulkan" +"Vulkan0","SWIGLU","type=f32,ne_a=[128,2,2,2],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","SWIGLU","type=f32,ne_a=[5,7,11,13],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","SWIGLU","type=f32,ne_a=[128,2,2,2],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","SWIGLU","type=f32,ne_a=[5,7,11,13],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","SWIGLU","type=f32,ne_a=[128,2,2,2],v=1,split","support","1","yes","Vulkan" +"Vulkan0","SWIGLU","type=f32,ne_a=[5,7,11,13],v=1,split","support","1","yes","Vulkan" +"Vulkan0","GEGLU_ERF","type=f32,ne_a=[128,2,2,2],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","GEGLU_ERF","type=f32,ne_a=[5,7,11,13],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","GEGLU_ERF","type=f32,ne_a=[128,2,2,2],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","GEGLU_ERF","type=f32,ne_a=[5,7,11,13],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","GEGLU_ERF","type=f32,ne_a=[128,2,2,2],v=1,split","support","1","yes","Vulkan" +"Vulkan0","GEGLU_ERF","type=f32,ne_a=[5,7,11,13],v=1,split","support","1","yes","Vulkan" +"Vulkan0","GEGLU_QUICK","type=f32,ne_a=[128,2,2,2],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","GEGLU_QUICK","type=f32,ne_a=[5,7,11,13],v=1,swapped=0","support","1","yes","Vulkan" +"Vulkan0","GEGLU_QUICK","type=f32,ne_a=[128,2,2,2],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","GEGLU_QUICK","type=f32,ne_a=[5,7,11,13],v=1,swapped=1","support","1","yes","Vulkan" +"Vulkan0","GEGLU_QUICK","type=f32,ne_a=[128,2,2,2],v=1,split","support","1","yes","Vulkan" +"Vulkan0","GEGLU_QUICK","type=f32,ne_a=[5,7,11,13],v=1,split","support","1","yes","Vulkan" "Vulkan0","SWIGLU_OAI","type=f32,ne_a=[128,2,2,2],v=0,alpha=0.500000,limit=2.000000","support","1","yes","Vulkan" "Vulkan0","SWIGLU_OAI","type=f32,ne_a=[128,2,2,2],v=0,alpha=0.500000,limit=7.000000","support","1","yes","Vulkan" "Vulkan0","SWIGLU_OAI","type=f32,ne_a=[128,2,2,2],v=0,alpha=1.702000,limit=2.000000","support","1","yes","Vulkan" "Vulkan0","SWIGLU_OAI","type=f32,ne_a=[128,2,2,2],v=0,alpha=1.702000,limit=7.000000","support","1","yes","Vulkan" -"Vulkan0","SWIGLU_OAI","type=f32,ne_a=[128,2,2,2],v=1,alpha=0.500000,limit=2.000000","support","0","no","Vulkan" -"Vulkan0","SWIGLU_OAI","type=f32,ne_a=[128,2,2,2],v=1,alpha=0.500000,limit=7.000000","support","0","no","Vulkan" -"Vulkan0","SWIGLU_OAI","type=f32,ne_a=[128,2,2,2],v=1,alpha=1.702000,limit=2.000000","support","0","no","Vulkan" -"Vulkan0","SWIGLU_OAI","type=f32,ne_a=[128,2,2,2],v=1,alpha=1.702000,limit=7.000000","support","0","no","Vulkan" +"Vulkan0","SWIGLU_OAI","type=f32,ne_a=[128,2,2,2],v=1,alpha=0.500000,limit=2.000000","support","1","yes","Vulkan" +"Vulkan0","SWIGLU_OAI","type=f32,ne_a=[128,2,2,2],v=1,alpha=0.500000,limit=7.000000","support","1","yes","Vulkan" +"Vulkan0","SWIGLU_OAI","type=f32,ne_a=[128,2,2,2],v=1,alpha=1.702000,limit=2.000000","support","1","yes","Vulkan" +"Vulkan0","SWIGLU_OAI","type=f32,ne_a=[128,2,2,2],v=1,alpha=1.702000,limit=7.000000","support","1","yes","Vulkan" "Vulkan0","GET_ROWS","type=f32,n=76800,m=5,r=4,be1=1,be2=2,v=0","support","1","yes","Vulkan" "Vulkan0","GET_ROWS","type=f32,n=256,m=80000,r=70000,be1=2,be2=1,v=0","support","1","yes","Vulkan" "Vulkan0","GET_ROWS","type=f32,n=256,m=5,r=4,be1=700,be2=100,v=0","support","1","yes","Vulkan" @@ -334,10 +334,18 @@ "Vulkan0","GET_ROWS","type=q8_0,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","Vulkan" "Vulkan0","GET_ROWS","type=q8_0,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","Vulkan" "Vulkan0","GET_ROWS","type=q8_0,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","Vulkan" +"Vulkan0","GET_ROWS","type=q1_0,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","Vulkan" +"Vulkan0","GET_ROWS","type=q1_0,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","Vulkan" +"Vulkan0","GET_ROWS","type=q1_0,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","Vulkan" +"Vulkan0","GET_ROWS","type=q1_0,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","Vulkan" "Vulkan0","GET_ROWS","type=mxfp4,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","Vulkan" "Vulkan0","GET_ROWS","type=mxfp4,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","Vulkan" "Vulkan0","GET_ROWS","type=mxfp4,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","Vulkan" "Vulkan0","GET_ROWS","type=mxfp4,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","Vulkan" +"Vulkan0","GET_ROWS","type=nvfp4,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","Vulkan" +"Vulkan0","GET_ROWS","type=nvfp4,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","Vulkan" +"Vulkan0","GET_ROWS","type=nvfp4,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","Vulkan" +"Vulkan0","GET_ROWS","type=nvfp4,n=256,m=5,r=4,be1=7,be2=1,v=1","support","1","yes","Vulkan" "Vulkan0","GET_ROWS","type=q2_K,n=256,m=5,r=4,be1=1,be2=1,v=0","support","1","yes","Vulkan" "Vulkan0","GET_ROWS","type=q2_K,n=256,m=5,r=4,be1=1,be2=1,v=1","support","1","yes","Vulkan" "Vulkan0","GET_ROWS","type=q2_K,n=256,m=5,r=4,be1=7,be2=1,v=0","support","1","yes","Vulkan" @@ -415,8 +423,12 @@ "Vulkan0","GET_ROWS_BACK","type=q5_1,n=256,m=5,r=4,b=1,v=1","support","0","no","Vulkan" "Vulkan0","GET_ROWS_BACK","type=q8_0,n=256,m=5,r=4,b=1,v=0","support","0","no","Vulkan" "Vulkan0","GET_ROWS_BACK","type=q8_0,n=256,m=5,r=4,b=1,v=1","support","0","no","Vulkan" +"Vulkan0","GET_ROWS_BACK","type=q1_0,n=256,m=5,r=4,b=1,v=0","support","0","no","Vulkan" +"Vulkan0","GET_ROWS_BACK","type=q1_0,n=256,m=5,r=4,b=1,v=1","support","0","no","Vulkan" "Vulkan0","GET_ROWS_BACK","type=mxfp4,n=256,m=5,r=4,b=1,v=0","support","0","no","Vulkan" "Vulkan0","GET_ROWS_BACK","type=mxfp4,n=256,m=5,r=4,b=1,v=1","support","0","no","Vulkan" +"Vulkan0","GET_ROWS_BACK","type=nvfp4,n=256,m=5,r=4,b=1,v=0","support","0","no","Vulkan" +"Vulkan0","GET_ROWS_BACK","type=nvfp4,n=256,m=5,r=4,b=1,v=1","support","0","no","Vulkan" "Vulkan0","GET_ROWS_BACK","type=q2_K,n=256,m=5,r=4,b=1,v=0","support","0","no","Vulkan" "Vulkan0","GET_ROWS_BACK","type=q2_K,n=256,m=5,r=4,b=1,v=1","support","0","no","Vulkan" "Vulkan0","GET_ROWS_BACK","type=q3_K,n=256,m=5,r=4,b=1,v=0","support","0","no","Vulkan" @@ -570,6 +582,18 @@ "Vulkan0","SET_ROWS","type=q8_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","1","yes","Vulkan" "Vulkan0","SET_ROWS","type=q8_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","1","yes","Vulkan" "Vulkan0","SET_ROWS","type=q8_0,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=1","support","1","yes","Vulkan" +"Vulkan0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","1","yes","Vulkan" +"Vulkan0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","1","yes","Vulkan" +"Vulkan0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,1,1],nr23=[2,3],r=2,v=0","support","1","yes","Vulkan" +"Vulkan0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","1","yes","Vulkan" +"Vulkan0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","1","yes","Vulkan" +"Vulkan0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,1,1],nr23=[2,3],r=2,v=1","support","1","yes","Vulkan" +"Vulkan0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","1","yes","Vulkan" +"Vulkan0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","1","yes","Vulkan" +"Vulkan0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,7,1],nr23=[2,3],r=2,v=0","support","1","yes","Vulkan" +"Vulkan0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","1","yes","Vulkan" +"Vulkan0","SET_ROWS","type=q1_0,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","1","yes","Vulkan" +"Vulkan0","SET_ROWS","type=q1_0,type_idx=i64,ne=[384,3,7,1],nr23=[2,3],r=2,v=1","support","1","yes","Vulkan" "Vulkan0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","Vulkan" "Vulkan0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","Vulkan" "Vulkan0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","Vulkan" @@ -582,6 +606,18 @@ "Vulkan0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","0","no","Vulkan" "Vulkan0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","0","no","Vulkan" "Vulkan0","SET_ROWS","type=mxfp4,type_idx=i64,ne=[96,3,7,1],nr23=[2,3],r=2,v=1","support","0","no","Vulkan" +"Vulkan0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","Vulkan" +"Vulkan0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","Vulkan" +"Vulkan0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","Vulkan" +"Vulkan0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=1","support","0","no","Vulkan" +"Vulkan0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=1","support","0","no","Vulkan" +"Vulkan0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,1,1],nr23=[2,3],r=2,v=1","support","0","no","Vulkan" +"Vulkan0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=0","support","0","no","Vulkan" +"Vulkan0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=0","support","0","no","Vulkan" +"Vulkan0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,7,1],nr23=[2,3],r=2,v=0","support","0","no","Vulkan" +"Vulkan0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,5,7,3],nr23=[1,1],r=1,v=1","support","0","no","Vulkan" +"Vulkan0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[256,11,1,7],nr23=[2,3],r=7,v=1","support","0","no","Vulkan" +"Vulkan0","SET_ROWS","type=nvfp4,type_idx=i64,ne=[192,3,7,1],nr23=[2,3],r=2,v=1","support","0","no","Vulkan" "Vulkan0","SET_ROWS","type=q2_K,type_idx=i64,ne=[256,5,1,3],nr23=[1,1],r=1,v=0","support","0","no","Vulkan" "Vulkan0","SET_ROWS","type=q2_K,type_idx=i64,ne=[256,11,1,1],nr23=[2,3],r=7,v=0","support","0","no","Vulkan" "Vulkan0","SET_ROWS","type=q2_K,type_idx=i64,ne=[768,3,1,1],nr23=[2,3],r=2,v=0","support","0","no","Vulkan" @@ -1014,6 +1050,8 @@ "Vulkan0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,2,2560],ne_kernel=[3,3,2,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","Vulkan" "Vulkan0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[5,5,1,32],ne_kernel=[3,4,1,32],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","1","yes","Vulkan" "Vulkan0","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[2,2,1536,729],ne_kernel=[2,2,1536,4096],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","1","yes","Vulkan" +"Vulkan0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[128,128,1,2],ne_kernel=[32,33,1,2],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","Vulkan" +"Vulkan0","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[128,128,2,1],ne_kernel=[33,34,2,1],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","Vulkan" "Vulkan0","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","1","yes","Vulkan" "Vulkan0","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","1","yes","Vulkan" "Vulkan0","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","1","yes","Vulkan" @@ -5011,9 +5049,12 @@ "Vulkan0","CONV_TRANSPOSE_1D","ne_input=[3,2,1,1],ne_kernel=[3,2,2,1],s0=1,p0=0,d0=1","support","1","yes","Vulkan" "Vulkan0","CONV_TRANSPOSE_1D","ne_input=[3,2,1,1],ne_kernel=[3,1,2,1],s0=1,p0=0,d0=1","support","1","yes","Vulkan" "Vulkan0","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","Vulkan" -"Vulkan0","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","Vulkan" -"Vulkan0","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","Vulkan" -"Vulkan0","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","Vulkan" +"Vulkan0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","Vulkan" +"Vulkan0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","Vulkan" +"Vulkan0","CONV_TRANSPOSE_2D","kernel_type=f32,ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","Vulkan" +"Vulkan0","CONV_TRANSPOSE_2D","kernel_type=f16,ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","Vulkan" +"Vulkan0","CONV_TRANSPOSE_2D","kernel_type=f16,ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","Vulkan" +"Vulkan0","CONV_TRANSPOSE_2D","kernel_type=f16,ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","Vulkan" "Vulkan0","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","Vulkan" "Vulkan0","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","Vulkan" "Vulkan0","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","Vulkan" @@ -5029,14 +5070,14 @@ "Vulkan0","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,2,1]","support","1","yes","Vulkan" "Vulkan0","REPEAT","type=f32,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","Vulkan" "Vulkan0","REPEAT","type=i32,ne=[10,5,4,1],nr=[2,1,1,1]","support","1","yes","Vulkan" -"Vulkan0","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","0","no","Vulkan" +"Vulkan0","REPEAT","type=i16,ne=[10,5,4,1],nr=[1,1,1,2]","support","1","yes","Vulkan" "Vulkan0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,1]","support","1","yes","Vulkan" "Vulkan0","REPEAT","type=f32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","Vulkan" "Vulkan0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,2,1,1]","support","1","yes","Vulkan" "Vulkan0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,2,1]","support","1","yes","Vulkan" "Vulkan0","REPEAT","type=f32,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","Vulkan" "Vulkan0","REPEAT","type=i32,ne=[10,5,4,3],nr=[2,1,1,1]","support","1","yes","Vulkan" -"Vulkan0","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","0","no","Vulkan" +"Vulkan0","REPEAT","type=i16,ne=[10,5,4,3],nr=[1,1,1,2]","support","1","yes","Vulkan" "Vulkan0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,1,1,1],v=0","support","1","yes","Vulkan" "Vulkan0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[2,1,1,1],v=0","support","1","yes","Vulkan" "Vulkan0","REPEAT_BACK","type=f32,ne=[8,6,4,2],nr=[1,2,1,1],v=0","support","1","yes","Vulkan" @@ -5069,415 +5110,547 @@ "Vulkan0","SET","type_src=i32,type_dst=i32,ne=[6,5,4,3],dim=2,inplace=1","support","1","yes","Vulkan" "Vulkan0","SET","type_src=i32,type_dst=i32,ne=[6,5,4,3],dim=3,inplace=0","support","1","yes","Vulkan" "Vulkan0","SET","type_src=i32,type_dst=i32,ne=[6,5,4,3],dim=3,inplace=1","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=bf16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=bf16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q4_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q4_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q4_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q4_1,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q5_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q5_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q5_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q5_1,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q8_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q8_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=mxfp4,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=mxfp4,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q2_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q2_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q3_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q3_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q4_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q4_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q5_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q5_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q6_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=q6_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq2_xxs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq2_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq2_xs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq2_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq2_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq2_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq3_xxs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq3_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq1_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq1_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq1_m,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq1_m,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq4_nl,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq4_nl,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq3_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq3_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq4_xs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=iq4_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=f16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=f16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q4_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q4_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q4_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q4_1,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q5_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q5_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q5_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q5_1,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q8_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q8_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=mxfp4,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=mxfp4,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q2_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q2_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q3_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q3_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q4_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q4_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q5_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q5_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q6_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=q6_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq2_xxs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq2_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq2_xs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq2_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq2_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq2_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq3_xxs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq3_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq1_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq1_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq1_m,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq1_m,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq4_nl,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq4_nl,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq3_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq3_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq4_xs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=iq4_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=bf16,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=bf16,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q4_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q4_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q4_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q4_1,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q5_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q5_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q5_1,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q5_1,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q8_0,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q8_0,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=mxfp4,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=mxfp4,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q2_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q2_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q3_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q3_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q4_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q4_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q5_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q5_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q6_K,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=q6_K,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq2_xxs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq2_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq2_xs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq2_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq2_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq2_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq3_xxs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq3_xxs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq1_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq1_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq1_m,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq1_m,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq4_nl,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq4_nl,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq3_s,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq3_s,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq4_xs,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=iq4_xs,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_0,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q4_0,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q4_1,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q4_1,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q5_0,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q5_0,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q5_1,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q5_1,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q8_0,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=q8_0,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=mxfp4,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=mxfp4,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q2_K,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q2_K,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q3_K,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q3_K,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_K,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q4_K,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_K,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q5_K,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q6_K,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=q6_K,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xxs,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xxs,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xs,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_xs,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_s,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq2_s,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_xxs,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_xxs,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_s,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_s,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_m,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq1_m,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_nl,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq4_nl,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=iq3_s,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq3_s,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_xs,type_dst=f32,ne=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=iq4_xs,type_dst=f32,ne=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f32,ne=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f16,ne=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=i32,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=i32,ne=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=i32,type_dst=f32,ne=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=i32,type_dst=f32,ne=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,3,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f16,type_dst=f16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=i32,type_dst=i32,ne=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=i32,type_dst=i32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" -"Vulkan0","CPY","type_src=f32,type_dst=f32,ne=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne_src=[1,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne_src=[1,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne_src=[1,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne_src=[2,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne_src=[2,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne_src=[2,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne_src=[3,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne_src=[3,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne_src=[3,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_1,type_dst=q4_1,ne_src=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_0,type_dst=q5_0,ne_src=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_1,type_dst=q5_1,ne_src=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q8_0,type_dst=q8_0,ne_src=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[128,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[128,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[128,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[384,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[384,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q1_0,type_dst=q1_0,ne_src=[384,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=mxfp4,type_dst=mxfp4,ne_src=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[128,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[128,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[128,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[192,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[192,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=nvfp4,type_dst=nvfp4,ne_src=[192,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q2_K,type_dst=q2_K,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q3_K,type_dst=q3_K,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_K,type_dst=q4_K,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_K,type_dst=q5_K,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q6_K,type_dst=q6_K,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xxs,type_dst=iq2_xxs,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xs,type_dst=iq2_xs,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_s,type_dst=iq2_s,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_xxs,type_dst=iq3_xxs,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_s,type_dst=iq1_s,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_m,type_dst=iq1_m,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[32,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[32,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[32,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[64,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[64,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[64,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[96,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[96,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_nl,type_dst=iq4_nl,ne_src=[96,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_s,type_dst=iq3_s,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[256,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[512,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[512,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[512,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[768,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[768,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_xs,type_dst=iq4_xs,ne_src=[768,2,3,4],permute_src=[0,3,1,2],permute_dst=[0,2,1,3],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=bf16,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=bf16,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q4_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q4_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q4_1,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q4_1,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q5_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q5_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q5_1,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q5_1,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q8_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q8_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q1_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q1_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=mxfp4,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=mxfp4,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=nvfp4,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=nvfp4,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q2_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q2_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q3_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q3_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q4_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q4_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q5_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q5_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q6_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=q6_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq2_xxs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq2_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq2_xs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq2_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq2_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq2_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq3_xxs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq3_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq1_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq1_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq1_m,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq1_m,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq4_nl,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq4_nl,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq3_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq3_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq4_xs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=iq4_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=f16,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=f16,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q4_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q4_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q4_1,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q4_1,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q5_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q5_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q5_1,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q5_1,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q8_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q8_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q1_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q1_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=mxfp4,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=mxfp4,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=nvfp4,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=nvfp4,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q2_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q2_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q3_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q3_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q4_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q4_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q5_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q5_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q6_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=q6_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq2_xxs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq2_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq2_xs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq2_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq2_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq2_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq3_xxs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq3_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq1_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq1_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq1_m,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq1_m,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq4_nl,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq4_nl,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq3_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq3_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq4_xs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=iq4_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f16,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f16,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=bf16,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=bf16,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q4_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q4_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q4_1,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q4_1,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q5_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q5_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q5_1,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q5_1,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q8_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q8_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q1_0,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q1_0,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=mxfp4,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=mxfp4,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=nvfp4,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=nvfp4,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q2_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q2_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q3_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q3_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q4_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q4_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q5_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q5_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q6_K,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=q6_K,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq2_xxs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq2_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq2_xs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq2_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq2_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq2_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq3_xxs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq3_xxs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq1_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq1_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq1_m,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq1_m,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq4_nl,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq4_nl,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq3_s,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq3_s,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq4_xs,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=iq4_xs,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_1,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_1,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q5_0,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q5_0,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q5_1,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q5_1,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q8_0,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q8_0,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q1_0,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q1_0,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=mxfp4,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=mxfp4,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=nvfp4,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=nvfp4,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q2_K,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q2_K,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q3_K,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q3_K,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_K,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q4_K,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_K,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q5_K,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q6_K,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=q6_K,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xxs,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xxs,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xs,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_xs,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_s,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq2_s,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_xxs,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_xxs,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_s,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_s,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_m,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq1_m,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_nl,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq4_nl,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=iq3_s,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq3_s,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_xs,type_dst=f32,ne_src=[256,4,4,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=iq4_xs,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,2,1,3],permute_dst=[0,0,0,0],_src_transpose=0","support","0","no","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f32,ne_src=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f16,ne_src=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=i32,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=i32,ne_src=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=i32,type_dst=f32,ne_src=[256,2,3,4],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=i32,type_dst=f32,ne_src=[256,2,3,4],permute_src=[1,0,2,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[256,4,3,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne_src=[256,4,3,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=bf16,type_dst=bf16,ne_src=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=i32,type_dst=i32,ne_src=[256,4,1,1],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=1","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=i32,type_dst=i32,ne_src=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[256,1,4,1],permute_src=[1,2,0,3],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[3,5,7,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[3,5,32,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[3,5,32,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[3,7,5,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[3,7,5,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[3,7,32,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[3,7,32,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[3,32,5,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[3,32,5,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[3,32,7,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[3,32,7,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[5,3,7,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[5,3,7,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[5,3,32,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[5,3,32,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[5,7,3,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[5,7,3,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[5,7,32,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[5,7,32,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[5,32,3,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[5,32,3,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[5,32,7,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[5,32,7,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[7,3,5,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[7,3,5,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[7,3,32,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[7,3,32,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[7,5,3,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[7,5,3,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[7,5,32,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[7,5,32,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[7,32,3,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[7,32,3,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[7,32,5,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[7,32,5,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[32,3,5,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[32,3,5,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,3,5,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[32,3,7,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[32,3,7,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,3,7,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[32,5,3,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[32,5,3,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,5,3,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[32,5,7,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[32,5,7,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,5,7,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[32,7,3,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[32,7,3,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,7,3,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f32,type_dst=f32,ne_src=[32,7,5,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=q4_0,type_dst=q4_0,ne_src=[32,7,5,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[3,5,7,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[3,5,32,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[3,5,32,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[3,7,5,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[3,7,5,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[3,7,32,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[3,7,32,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[3,32,5,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[3,32,5,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[3,32,7,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[3,32,7,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[5,3,7,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[5,3,7,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[5,3,32,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[5,3,32,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[5,7,3,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[5,7,3,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[5,7,32,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[5,7,32,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[5,32,3,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[5,32,3,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[5,32,7,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[5,32,7,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[7,3,5,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[7,3,5,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[7,3,32,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[7,3,32,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[7,5,3,32],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[7,5,3,32],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[7,5,32,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[7,5,32,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[7,32,3,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[7,32,3,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[7,32,5,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[7,32,5,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[32,3,5,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[32,3,5,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[32,3,7,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[32,3,7,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[32,5,3,7],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[32,5,3,7],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[32,5,7,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[32,5,7,3],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[32,7,3,5],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[32,7,3,5],ne_dst=[32,7,5,3],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" +"Vulkan0","CPY","type_src=f16,type_dst=f16,ne_src=[32,7,5,3],ne_dst=[3,5,7,32],permute_src=[0,0,0,0],permute_dst=[0,0,0,0],_src_transpose=0","support","1","yes","Vulkan" "Vulkan0","CONT","type=f32,ne=[2,1,1,1],use_view_slice=1","support","1","yes","Vulkan" "Vulkan0","CONT","type=f32,ne=[2,1,3,5],use_view_slice=1","support","1","yes","Vulkan" "Vulkan0","CONT","type=f32,ne=[2,3,5,7],use_view_slice=1","support","1","yes","Vulkan" @@ -5514,368 +5687,366 @@ "Vulkan0","CONT","type=bf16,ne=[1,4,4,1],use_view_slice=0","support","1","yes","Vulkan" "Vulkan0","CONT","type=bf16,ne=[1,8,17,1],use_view_slice=0","support","1","yes","Vulkan" "Vulkan0","CONT","type=bf16,ne=[10,10,10,1],use_view_slice=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f16,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f16,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f16,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f16,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","SUB","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0","support","1","yes","Vulkan" -"Vulkan0","ADD1","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan" -"Vulkan0","ADD1","type=f32,ne=[1024,1024,1,1]","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f16,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f16,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f16,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f16,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,8,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,1,1],nr=[32,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,320,320],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,1,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,1],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[2,1,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,2,1,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,2,1],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,1,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,1,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[1,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,3],nr=[2,2,2,2],nf=1,perm1=1,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,6],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[10,5,4,5],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,120,120],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,4,320],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=1","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,65536,1],nr=[256,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1280,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1280,1,1,1],nr=[1,16,16,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1280,16,16,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1280,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,1280,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[16,16,1280,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,1920,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,2560,1],nr=[16,16,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,1280,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,1920,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[1,1,640,1],nr=[32,32,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[5120,1,1,1],nr=[1,256,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[640,1,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","ADD","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SUB","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","MUL","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" +"Vulkan0","DIV","type=f32,ne=[64,262144,1,1],nr=[1,1,1,1],nf=1,perm1=0,src_overlap=0","support","1","yes","Vulkan" "Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=0.000000,inplace=0","support","1","yes","Vulkan" "Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=0","support","1","yes","Vulkan" "Vulkan0","SCALE","type=f32,ne=[10,10,10,10],scale=2.000000,bias=1.000000,inplace=1","support","1","yes","Vulkan" @@ -5937,6 +6108,20 @@ "Vulkan0","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=0.100000","support","1","yes","Vulkan" "Vulkan0","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.100000,v=0","support","1","yes","Vulkan" "Vulkan0","L2_NORM","type=f32,ne=[1025,5,4,3],eps=0.100000,v=1","support","1","yes","Vulkan" +"Vulkan0","NORM","type=f32,ne=[64,5,4,3],v=0,eps=10.000000","support","1","yes","Vulkan" +"Vulkan0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=10.000000,inplace=0","support","1","yes","Vulkan" +"Vulkan0","NORM","type=f32,ne=[64,5,4,3],v=1,eps=10.000000","support","0","no","Vulkan" +"Vulkan0","RMS_NORM","type=f32,ne=[64,5,4,3],v=1,eps=10.000000,inplace=0","support","1","yes","Vulkan" +"Vulkan0","RMS_NORM_BACK","type=f32,ne=[64,5,4,3],eps=10.000000","support","1","yes","Vulkan" +"Vulkan0","L2_NORM","type=f32,ne=[64,5,4,3],eps=10.000000,v=0","support","1","yes","Vulkan" +"Vulkan0","L2_NORM","type=f32,ne=[64,5,4,3],eps=10.000000,v=1","support","1","yes","Vulkan" +"Vulkan0","NORM","type=f32,ne=[1025,5,4,3],v=0,eps=10.000000","support","1","yes","Vulkan" +"Vulkan0","RMS_NORM","type=f32,ne=[1025,5,4,3],v=0,eps=10.000000,inplace=0","support","1","yes","Vulkan" +"Vulkan0","NORM","type=f32,ne=[1025,5,4,3],v=1,eps=10.000000","support","0","no","Vulkan" +"Vulkan0","RMS_NORM","type=f32,ne=[1025,5,4,3],v=1,eps=10.000000,inplace=0","support","1","yes","Vulkan" +"Vulkan0","RMS_NORM_BACK","type=f32,ne=[1025,5,4,3],eps=10.000000","support","1","yes","Vulkan" +"Vulkan0","L2_NORM","type=f32,ne=[1025,5,4,3],eps=10.000000,v=0","support","1","yes","Vulkan" +"Vulkan0","L2_NORM","type=f32,ne=[1025,5,4,3],eps=10.000000,v=1","support","1","yes","Vulkan" "Vulkan0","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","1","yes","Vulkan" "Vulkan0","SSM_CONV","type=f32,ne_a=[3,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Vulkan" "Vulkan0","SSM_CONV","type=f32,ne_a=[6,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Vulkan" @@ -5983,9 +6168,10 @@ "Vulkan0","SSM_CONV","type=f32,ne_a=[9,2048,4,1],ne_b=[9,2048,1,1]","support","1","yes","Vulkan" "Vulkan0","SSM_CONV","type=f32,ne_a=[72,2048,1,1],ne_b=[9,2048,1,1]","support","1","yes","Vulkan" "Vulkan0","SSM_CONV","type=f32,ne_a=[72,2048,4,1],ne_b=[9,2048,1,1]","support","1","yes","Vulkan" -"Vulkan0","SSM_SCAN","type=f32,d_state=16,head_dim=1,n_head=1024,n_group=1,n_seq_tokens=32,n_seqs=4","support","0","no","Vulkan" -"Vulkan0","SSM_SCAN","type=f32,d_state=128,head_dim=64,n_head=16,n_group=2,n_seq_tokens=32,n_seqs=4","support","1","yes","Vulkan" -"Vulkan0","SSM_SCAN","type=f32,d_state=256,head_dim=64,n_head=8,n_group=2,n_seq_tokens=32,n_seqs=4","support","1","yes","Vulkan" +"Vulkan0","SSM_SCAN","type=f32,d_state=16,head_dim=1,n_head=1024,n_group=1,n_seq_tokens=32,n_seqs=4,xbc_overlap=0","support","0","no","Vulkan" +"Vulkan0","SSM_SCAN","type=f32,d_state=128,head_dim=64,n_head=16,n_group=2,n_seq_tokens=32,n_seqs=4,xbc_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SSM_SCAN","type=f32,d_state=256,head_dim=64,n_head=8,n_group=2,n_seq_tokens=32,n_seqs=4,xbc_overlap=0","support","1","yes","Vulkan" +"Vulkan0","SSM_SCAN","type=f32,d_state=128,head_dim=128,n_head=4,n_group=4,n_seq_tokens=16,n_seqs=2,xbc_overlap=1","support","1","yes","Vulkan" "Vulkan0","RWKV_WKV6","type=f32,head_count=32,head_size=64,n_seq_tokens=1,n_seqs=1","support","1","yes","Vulkan" "Vulkan0","RWKV_WKV6","type=f32,head_count=32,head_size=64,n_seq_tokens=32,n_seqs=1","support","1","yes","Vulkan" "Vulkan0","RWKV_WKV6","type=f32,head_count=32,head_size=64,n_seq_tokens=32,n_seqs=4","support","1","yes","Vulkan" @@ -5998,6 +6184,12 @@ "Vulkan0","GATED_LINEAR_ATTN","type=f32,head_count=32,head_size=64,n_seq_tokens=32,n_seqs=1","support","0","no","Vulkan" "Vulkan0","GATED_LINEAR_ATTN","type=f32,head_count=32,head_size=64,n_seq_tokens=32,n_seqs=4","support","0","no","Vulkan" "Vulkan0","GATED_LINEAR_ATTN","type=f32,head_count=32,head_size=64,n_seq_tokens=128,n_seqs=4","support","0","no","Vulkan" +"Vulkan0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=128,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=64,n=1,k=64,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=256,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=512,n=1,k=512,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=128,n=32,k=128,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_HADAMARD","type_a=f32,type_b=f32,m=128,n=4,k=128,bs=[2,3],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=f32,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=f32,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=f32,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" @@ -6070,6 +6262,15 @@ "Vulkan0","MUL_MAT","type_a=q8_0,type_b=f32,m=16,n=7,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q8_0,type_b=f32,m=16,n=8,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q8_0,type_b=f32,m=16,n=9,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=4,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=5,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=6,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=7,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=8,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=9,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=mxfp4,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=mxfp4,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=mxfp4,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" @@ -6079,6 +6280,15 @@ "Vulkan0","MUL_MAT","type_a=mxfp4,type_b=f32,m=16,n=7,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=mxfp4,type_b=f32,m=16,n=8,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=mxfp4,type_b=f32,m=16,n=9,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=4,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=5,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=6,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=7,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=8,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=9,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q2_K,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q2_K,type_b=f32,m=16,n=2,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q2_K,type_b=f32,m=16,n=3,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" @@ -6499,6 +6709,68 @@ "Vulkan0","MUL_MAT","type_a=q8_0,type_b=f16,m=16,n=8,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q8_0,type_b=f16,m=16,n=16,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q8_0,type_b=f16,m=16,n=8,k=256,bs=[1536,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=256,bs=[3,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=256,bs=[3,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=256,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=256,bs=[3,2],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=256,bs=[3,2],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=256,bs=[3,2],nr=[2,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=16,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=16,k=256,bs=[1,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=16,k=256,bs=[1,1],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=16,k=256,bs=[3,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=16,k=256,bs=[3,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=16,k=256,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=16,k=256,bs=[3,2],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=16,k=256,bs=[3,2],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=16,k=256,bs=[3,2],nr=[2,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=8,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=16,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=8,k=256,bs=[1536,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=1,k=256,bs=[1,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=1,k=256,bs=[1,1],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=1,k=256,bs=[3,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=1,k=256,bs=[3,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=1,k=256,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=1,k=256,bs=[3,2],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=1,k=256,bs=[3,2],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=1,k=256,bs=[3,2],nr=[2,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=16,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=16,k=256,bs=[1,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=16,k=256,bs=[1,1],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=16,k=256,bs=[3,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=16,k=256,bs=[3,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=16,k=256,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=16,k=256,bs=[3,2],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=16,k=256,bs=[3,2],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=16,k=256,bs=[3,2],nr=[2,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=1,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=8,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=16,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f16,m=16,n=8,k=256,bs=[1536,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q4_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" @@ -6747,6 +7019,68 @@ "Vulkan0","MUL_MAT","type_a=mxfp4,type_b=f16,m=16,n=8,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=mxfp4,type_b=f16,m=16,n=16,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=mxfp4,type_b=f16,m=16,n=8,k=256,bs=[1536,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=1,k=256,bs=[3,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=1,k=256,bs=[3,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=1,k=256,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=1,k=256,bs=[3,2],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=1,k=256,bs=[3,2],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=1,k=256,bs=[3,2],nr=[2,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=16,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=16,k=256,bs=[1,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=16,k=256,bs=[1,1],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=16,k=256,bs=[3,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=16,k=256,bs=[3,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=16,k=256,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=16,k=256,bs=[3,2],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=16,k=256,bs=[3,2],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=16,k=256,bs=[3,2],nr=[2,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=1,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=8,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=16,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=16,n=8,k=256,bs=[1536,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=1,k=256,bs=[1,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=1,k=256,bs=[1,1],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=1,k=256,bs=[3,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=1,k=256,bs=[3,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=1,k=256,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=1,k=256,bs=[3,2],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=1,k=256,bs=[3,2],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=1,k=256,bs=[3,2],nr=[2,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=16,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=16,k=256,bs=[1,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=16,k=256,bs=[1,1],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=16,k=256,bs=[3,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=16,k=256,bs=[3,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=16,k=256,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=16,k=256,bs=[3,2],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=16,k=256,bs=[3,2],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=16,k=256,bs=[3,2],nr=[2,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","0","no","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=1,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=8,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=16,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f16,m=16,n=8,k=256,bs=[1536,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=iq2_xxs,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=iq2_xxs,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[2,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=iq2_xxs,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,2],per=[0,1,2,3],k_v=0,o=1","support","0","no","Vulkan" @@ -6809,6 +7143,15 @@ "Vulkan0","MUL_MAT","type_a=iq2_xxs,type_b=f16,m=16,n=8,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=iq2_xxs,type_b=f16,m=16,n=16,k=1024,bs=[3,2],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=iq2_xxs,type_b=f16,m=16,n=8,k=256,bs=[1536,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=1,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=8,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,1,3,2],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=bf16,type_b=f32,m=16,n=16,k=256,bs=[2,3],nr=[1,1],per=[0,3,2,1],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q4_1,type_b=f32,m=16,n=1,k=32,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q4_1,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q5_0,type_b=f32,m=16,n=1,k=32,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" @@ -6817,6 +7160,8 @@ "Vulkan0","MUL_MAT","type_a=q5_1,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q8_0,type_b=f32,m=16,n=1,k=32,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q8_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=128,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q2_K,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q3_K,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q5_K,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" @@ -6841,10 +7186,6 @@ "Vulkan0","MUL_MAT","type_a=f16,type_b=f32,m=1056,n=1,k=193,bs=[1,1],nr=[4,1],per=[0,2,1,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=f16,type_b=f32,m=1056,n=1,k=67,bs=[1,1],nr=[4,1],per=[0,2,1,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=f32,type_b=f32,m=64,n=77,k=77,bs=[12,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" -"Vulkan0","MUL_MAT","type_a=f16,type_b=f32,m=2,n=1,k=3,bs=[128,1024],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" -"Vulkan0","MUL_MAT","type_a=f16,type_b=f32,m=2,n=3,k=4,bs=[128,1024],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" -"Vulkan0","MUL_MAT","type_a=f16,type_b=f32,m=2,n=1,k=3,bs=[131072,1],nr=[1,1],per=[0,2,1,3],k_v=0,o=1","support","1","yes","Vulkan" -"Vulkan0","MUL_MAT","type_a=f16,type_b=f32,m=2,n=1,k=3,bs=[131072,1],nr=[1,1],per=[0,1,2,3],k_v=64,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q4_0,type_b=f32,m=576,n=512,k=576,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=f32,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" @@ -6855,7 +7196,9 @@ "Vulkan0","MUL_MAT","type_a=q5_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q5_1,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q8_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=q1_0,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=mxfp4,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT","type_a=nvfp4,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q2_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q3_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" "Vulkan0","MUL_MAT","type_a=q4_K,type_b=f32,m=1,n=64,k=256,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1","support","1","yes","Vulkan" @@ -7479,6 +7822,78 @@ "Vulkan0","MUL_MAT_ID","type_a=q8_0,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=17,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=q8_0,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=32,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=q8_0,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=1,b=1,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=1,b=1,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=1,b=1,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=1,b=1,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=1,b=1,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=1,b=1,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=1,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=1,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=1,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=1,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=1,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=1,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=4,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=4,b=0,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=4,b=0,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=4,b=0,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=4,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=4,b=0,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=4,b=1,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=4,b=1,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=4,b=1,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=4,b=1,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=4,b=1,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=4,b=1,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=1,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=1,b=0,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=1,b=0,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=1,b=0,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=1,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=1,b=0,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=1,b=1,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=1,b=1,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=1,b=1,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=1,b=1,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=1,b=1,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=1,b=1,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=2,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=2,b=0,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=2,b=0,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=2,b=0,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=2,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=2,b=0,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=2,b=1,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=2,b=1,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=2,b=1,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=2,b=1,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=2,b=1,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=2,b=1,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=4,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=4,b=0,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=4,b=0,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=4,b=0,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=4,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=4,b=0,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=129,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=q4_0,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=q4_0,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=4,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=q4_0,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=5,k=256","support","1","yes","Vulkan" @@ -7767,6 +8182,78 @@ "Vulkan0","MUL_MAT_ID","type_a=mxfp4,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=17,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=mxfp4,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=32,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=mxfp4,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=1,b=1,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=1,b=1,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=1,b=1,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=1,b=1,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=1,b=1,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=1,b=1,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=1,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=1,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=1,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=1,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=1,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=2,b=1,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=4,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=4,b=0,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=4,b=0,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=4,b=0,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=4,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=4,b=0,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=4,b=1,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=4,b=1,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=4,b=1,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=4,b=1,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=4,b=1,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=4,n_used=4,b=1,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=1,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=1,b=0,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=1,b=0,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=1,b=0,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=1,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=1,b=0,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=1,b=1,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=1,b=1,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=1,b=1,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=1,b=1,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=1,b=1,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=1,b=1,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=2,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=2,b=0,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=2,b=0,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=2,b=0,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=2,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=2,b=0,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=2,b=1,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=2,b=1,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=2,b=1,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=2,b=1,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=2,b=1,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=2,b=1,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=4,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=4,b=0,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=4,b=0,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=4,b=0,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=4,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=4,b=0,m=512,n=129,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=4,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=5,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=17,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=nvfp4,type_b=f32,n_mats=8,n_used=4,b=1,m=512,n=129,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=iq2_xxs,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=iq2_xxs,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=4,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=iq2_xxs,type_b=f32,n_mats=4,n_used=1,b=0,m=512,n=5,k=256","support","1","yes","Vulkan" @@ -7847,6 +8334,8 @@ "Vulkan0","MUL_MAT_ID","type_a=q5_1,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=q8_0,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=q8_0,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" +"Vulkan0","MUL_MAT_ID","type_a=q1_0,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=q2_K,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=q2_K,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=32,k=256","support","1","yes","Vulkan" "Vulkan0","MUL_MAT_ID","type_a=q3_K,type_b=f32,n_mats=4,n_used=2,b=0,m=512,n=1,k=256","support","1","yes","Vulkan" @@ -8257,6 +8746,134 @@ "Vulkan0","OUT_PROD","type_a=q8_0,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" "Vulkan0","OUT_PROD","type_a=q8_0,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" "Vulkan0","OUT_PROD","type_a=q8_0,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=1,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=1,k=16,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=1,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f32,m=256,n=16,k=16,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=1,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=1,k=16,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=1,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=q1_0,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" "Vulkan0","OUT_PROD","type_a=q4_0,type_b=f32,m=256,n=1,k=1,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" "Vulkan0","OUT_PROD","type_a=q4_0,type_b=f32,m=256,n=1,k=1,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" "Vulkan0","OUT_PROD","type_a=q4_0,type_b=f32,m=256,n=1,k=1,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" @@ -8769,6 +9386,134 @@ "Vulkan0","OUT_PROD","type_a=mxfp4,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" "Vulkan0","OUT_PROD","type_a=mxfp4,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" "Vulkan0","OUT_PROD","type_a=mxfp4,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=1,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=1,k=16,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=1,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f32,m=256,n=16,k=16,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=1,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=1,k=16,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=1,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[1,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[1,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[1,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[1,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[1,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[3,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[3,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[3,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[3,1],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=nvfp4,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" "Vulkan0","OUT_PROD","type_a=iq2_xxs,type_b=f32,m=256,n=1,k=1,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" "Vulkan0","OUT_PROD","type_a=iq2_xxs,type_b=f32,m=256,n=1,k=1,bs=[1,1],nr=[1,2],trans_b=0","support","0","no","Vulkan" "Vulkan0","OUT_PROD","type_a=iq2_xxs,type_b=f32,m=256,n=1,k=1,bs=[1,1],nr=[2,1],trans_b=0","support","0","no","Vulkan" @@ -8897,6 +9642,10 @@ "Vulkan0","OUT_PROD","type_a=iq2_xxs,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[1,2],trans_b=0","support","0","no","Vulkan" "Vulkan0","OUT_PROD","type_a=iq2_xxs,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[2,1],trans_b=0","support","0","no","Vulkan" "Vulkan0","OUT_PROD","type_a=iq2_xxs,type_b=f16,m=256,n=16,k=16,bs=[3,3],nr=[2,2],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=f32,type_b=f32,m=256,n=16,k=16,bs=[1,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=f32,type_b=f32,m=256,n=16,k=16,bs=[8,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=f32,type_b=f32,m=256,n=16,k=16,bs=[16,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" +"Vulkan0","OUT_PROD","type_a=f32,type_b=f32,m=256,n=16,k=16,bs=[32,1],nr=[1,1],trans_b=0","support","0","no","Vulkan" "Vulkan0","ADD_ID","type_a=f32,type_b=f32,n_embd=32,n_experts=4,n_experts_used=1,n_token=1","support","1","yes","Vulkan" "Vulkan0","ADD_ID","type_a=f32,type_b=f32,n_embd=32,n_experts=4,n_experts_used=1,n_token=32","support","1","yes","Vulkan" "Vulkan0","ADD_ID","type_a=f32,type_b=f32,n_embd=32,n_experts=4,n_experts_used=1,n_token=129","support","1","yes","Vulkan" @@ -9316,6 +10065,11 @@ "Vulkan0","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" @@ -9370,126 +10124,281 @@ "Vulkan0","ROPE","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" @@ -9544,6 +10453,11 @@ "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,40,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,52,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" @@ -9598,126 +10512,281 @@ "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,16,2,1],n_dims=128,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[16,16,8192,1],n_dims=16,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.000000,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.000000,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.000000,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f32,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f32,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=0,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE_BACK","type=f16,ne_a=[64,128,2,1],n_dims=64,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=1,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=2,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=8,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=40,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" +"Vulkan0","ROPE_BACK","type=f16,ne_a=[36,16,2457,1],n_dims=36,mode=24,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=1,v=2,inplace=0","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=0,inplace=1","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,1],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=1","support","1","yes","Vulkan" "Vulkan0","ROPE","type=f32,ne_a=[128,32,2,3],n_dims=128,mode=0,n_ctx=512,fs=1.424500,ef=0.746500,af=1.424500,ff=0,v=1,inplace=1","support","1","yes","Vulkan" @@ -9853,10 +10922,12 @@ "Vulkan0","ARGSORT","type=f32,ne=[1023,2,1,3],order=0","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[1024,2,1,3],order=0","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=0","support","1","yes","Vulkan" +"Vulkan0","ARGSORT","type=f32,ne=[1025,256,1,1],order=0","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=0","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=0","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=0","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[2,8,8192,1],order=0","support","1","yes","Vulkan" +"Vulkan0","ARGSORT","type=f32,ne=[2048,512,1,1],order=0","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[3,1,1,1],order=0","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[4,1,1,1],order=0","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[7,1,1,1],order=0","support","1","yes","Vulkan" @@ -9900,10 +10971,12 @@ "Vulkan0","ARGSORT","type=f32,ne=[1023,2,1,3],order=1","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[1024,2,1,3],order=1","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[1025,2,1,3],order=1","support","1","yes","Vulkan" +"Vulkan0","ARGSORT","type=f32,ne=[1025,256,1,1],order=1","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[2047,2,1,3],order=1","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","Vulkan" "Vulkan0","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","Vulkan" +"Vulkan0","ARGSORT","type=f32,ne=[2048,512,1,1],order=1","support","1","yes","Vulkan" "Vulkan0","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","1","yes","Vulkan" "Vulkan0","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","1","yes","Vulkan" "Vulkan0","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","1","yes","Vulkan" @@ -10267,6 +11340,19 @@ "Vulkan0","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","Vulkan" "Vulkan0","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","1","yes","Vulkan" "Vulkan0","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,tfrm=0,circular=0","support","1","yes","Vulkan" +"Vulkan0","PAD","type=f32,ne_a=[1024,1,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","Vulkan" +"Vulkan0","PAD","type=f32,ne_a=[1024,2,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","Vulkan" +"Vulkan0","PAD","type=f32,ne_a=[1024,16,1,1],pad_0=0,pad_1=1,circular=0","support","1","yes","Vulkan" +"Vulkan0","PAD","type=f32,ne_a=[1023,1,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","Vulkan" +"Vulkan0","PAD","type=f32,ne_a=[1023,8,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","Vulkan" +"Vulkan0","PAD","type=f32,ne_a=[1025,1,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","Vulkan" +"Vulkan0","PAD","type=f32,ne_a=[1025,8,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","Vulkan" +"Vulkan0","PAD","type=f32,ne_a=[2048,1,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","Vulkan" +"Vulkan0","PAD","type=f32,ne_a=[2048,4,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","Vulkan" +"Vulkan0","PAD","type=f32,ne_a=[2049,1,1,1],pad_0=1,pad_1=0,circular=0","support","1","yes","Vulkan" +"Vulkan0","PAD","type=f32,ne_a=[100,1,1,1],pad_0=100,pad_1=0,circular=0","support","1","yes","Vulkan" +"Vulkan0","PAD","type=f32,ne_a=[100,1,1,1],pad_0=0,pad_1=100,circular=0","support","1","yes","Vulkan" +"Vulkan0","PAD","type=f32,ne_a=[100,100,1,1],pad_0=50,pad_1=50,circular=0","support","1","yes","Vulkan" "Vulkan0","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","0","no","Vulkan" "Vulkan0","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","0","no","Vulkan" "Vulkan0","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","1","yes","Vulkan" @@ -10290,6 +11376,9 @@ "Vulkan0","CUMSUM","type=f32,ne=[375960,1,1,1]","support","1","yes","Vulkan" "Vulkan0","CUMSUM","type=f32,ne=[20481,4,1,1]","support","1","yes","Vulkan" "Vulkan0","XIELU","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan" +"Vulkan0","XIELU","type=f16,ne=[10,5,4,3]","support","1","yes","Vulkan" +"Vulkan0","XIELU","type=f32,ne=[512,16,1,1]","support","1","yes","Vulkan" +"Vulkan0","XIELU","type=f16,ne=[512,16,1,1]","support","1","yes","Vulkan" "Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=3","support","1","yes","Vulkan" "Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=2","support","1","yes","Vulkan" "Vulkan0","TRI","type=f32,ne=[10,10,4,3],tri_type=1","support","1","yes","Vulkan" @@ -10337,3270 +11426,5142 @@ "Vulkan0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,tfrm=2,circular=0","support","1","yes","Vulkan" "Vulkan0","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,tfrm=2,circular=1","support","1","yes","Vulkan" "Vulkan0","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,tfrm=2,circular=1","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,2,1,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","0","no","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" -"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,3],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[4,3],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,2,1,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f32,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=bf16,type_V=bf16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_1,type_V=q5_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q5_0,type_V=q5_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_1,type_V=q4_1,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=iq4_nl,type_V=iq4_nl,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=80,hsv=80,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=96,hsv=96,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[12,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=10.000000,prec=def,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[8,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=128,nh=4,nr23=[16,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[8,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=192,hsv=192,nh=4,nr23=[16,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=256,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=1,nr23=[32,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=320,hsv=256,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=512,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=1,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,2,1,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=1,sinks=0,max_bias=8.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=1,nr23=[20,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=113,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[1,1],kv=1024,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=3,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=32,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=576,hsv=512,nh=4,nr23=[4,1],kv=512,nb=75,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q8_0,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=f16,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=72,hsv=72,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q8_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=64,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=f32,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=256,nb=1,mask=0,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=f16,type_V=q4_0,permute=[0,1,2,3]","support","1","yes","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=128,nh=4,nr23=[1,1],kv=96,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q1_0,type_V=q1_0,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=64,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q1_0,type_V=q4_0,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=64,hsv=128,nh=4,nr23=[1,1],kv=128,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q4_0,type_V=q1_0,permute=[0,1,2,3]","support","0","no","Vulkan" +"Vulkan0","FLASH_ATTN_EXT","hsk=128,hsv=64,nh=4,nr23=[1,1],kv=64,nb=2,mask=1,sinks=0,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_K=q1_0,type_V=f16,permute=[0,1,2,3]","support","0","no","Vulkan" "Vulkan0","CROSS_ENTROPY_LOSS","type=f32,ne=[10,5,4,3]","support","0","no","Vulkan" "Vulkan0","CROSS_ENTROPY_LOSS","type=f32,ne=[30000,1,1,1]","support","0","no","Vulkan" "Vulkan0","CROSS_ENTROPY_LOSS_BACK","type=f32,ne=[10,5,4,3]","support","0","no","Vulkan" "Vulkan0","CROSS_ENTROPY_LOSS_BACK","type=f32,ne=[30000,1,1,1]","support","0","no","Vulkan" "Vulkan0","OPT_STEP_ADAMW","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan" "Vulkan0","OPT_STEP_SGD","type=f32,ne=[10,5,4,3]","support","1","yes","Vulkan" -"Vulkan0","GATED_DELTA_NET","type=f32,head_count=32,head_size=128,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan" -"Vulkan0","GATED_DELTA_NET","type=f32,head_count=16,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan" -"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan" -"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=0","support","0","no","Vulkan" -"Vulkan0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=0","support","0","no","Vulkan" -"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=0","support","0","no","Vulkan" -"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=1,kda=0","support","0","no","Vulkan" -"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan" -"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan" -"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=32,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan" -"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=1","support","0","no","Vulkan" -"Vulkan0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=1","support","0","no","Vulkan" -"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=1","support","0","no","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=32,head_size=128,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=32,head_size=16,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","0","no","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=32,head_size=16,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=1,kda=1,K=1","support","0","no","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=32,head_size=16,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=1,K=1","support","0","no","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=16,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=0,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=0,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=1,kda=0,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=1,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=16,n_seq_tokens=1,n_seqs=2,v_repeat=1,permuted=0,kda=1,K=1","support","0","no","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=32,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=1,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=1,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=16,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=1,kda=1,K=1","support","0","no","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=64,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=127,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=256,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=65,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=100,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=200,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=127,n_seqs=2,v_repeat=1,permuted=0,kda=0,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=64,n_seqs=1,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=33,n_seqs=1,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=100,n_seqs=1,v_repeat=1,permuted=0,kda=1,K=1","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=16,n_seq_tokens=2,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=2","support","0","no","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=32,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=4","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=0,K=4","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=8,head_size=128,n_seq_tokens=4,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=4","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=4,n_seqs=2,v_repeat=1,permuted=0,kda=1,K=4","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=8,head_size=32,n_seq_tokens=4,n_seqs=2,v_repeat=2,permuted=0,kda=1,K=4","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=32,n_seq_tokens=8,n_seqs=1,v_repeat=1,permuted=0,kda=0,K=3","support","1","yes","Vulkan" +"Vulkan0","GATED_DELTA_NET","type=f32,head_count=4,head_size=64,n_seq_tokens=16,n_seqs=2,v_repeat=1,permuted=0,kda=0,K=4","support","1","yes","Vulkan" diff --git a/docs/preset.md b/docs/preset.md index d49fb0a1aeb3..85762a420b31 100644 --- a/docs/preset.md +++ b/docs/preset.md @@ -8,55 +8,53 @@ The INI preset feature, introduced in [PR#17859](https://github.com/ggml-org/lla When running multiple models on the server (router mode), INI preset files can be used to configure model-specific parameters. Please refer to the [server documentation](../tools/server/README.md) for more details. -### Using a Remote Preset +### Using a Hugging Face Preset -> [!NOTE] +> [!IMPORTANT] > -> This feature is currently only supported via the `-hf` option. +> Please only use presets that you can trust! Unknown presets may be unsafe -For GGUF models hosted on Hugging Face, you can include a `preset.ini` file in the root directory of the repository to define specific configurations for that model. +You can push your preset to Hugging Face Hub and share with other users by: +1. Creating an empty model repository on Hugging Face +2. Creating a `preset.ini` file in the root directory of the repository -Example: +Example of a `preset.ini`: ```ini -hf-repo-draft = username/my-draft-model-GGUF -temp = 0.5 -top-k = 20 -top-p = 0.95 -``` - -For security reasons, only certain options are allowed. Please refer to [preset.cpp](../common/preset.cpp) for the complete list of permitted options. - -Example usage: - -Assuming your repository `username/my-model-with-preset` contains a `preset.ini` with the configuration above: +[*] +ctx-size = 0 +mmap = 1 +kv-unified = 1 +parallel = 4 +spec-default = 1 + +[Qwen3.5-4B] +hf = unsloth/Qwen3.5-4B-GGUF:Q4_K_M +ctx-size = 262144 +batch-size = 2048 +ubatch-size = 2048 +top-p = 1.0 +top-k = 0 +min-p = 0.01 +temp = 1.0 -```sh -llama-cli -hf username/my-model-with-preset - -# This is equivalent to: -llama-cli -hf username/my-model-with-preset \ - --hf-repo-draft username/my-draft-model-GGUF \ - --temp 0.5 \ - --top-k 20 \ - --top-p 0.95 +[gpt-oss-120b-hf] +hf = ggml-org/gpt-oss-120b-GGUF +ctx-size = 262144 +batch-size = 2048 +ubatch-size = 2048 +top-p = 1.0 +top-k = 0 +min-p = 0.01 +temp = 1.0 +chat-template-kwargs = {"reasoning_effort": "high"} ``` -You can also override preset arguments by specifying them on the command line: +The preset will be loaded similarly to the `--models-preset` option. Therefore, you can also override certain params via CLI arguments: ```sh # Force temp = 0.1, overriding the preset value -llama-cli -hf username/my-model-with-preset --temp 0.1 -``` - -If you want to define multiple preset configurations for one or more GGUF models, you can create a blank HF repo for each preset. Each HF repo should contain a `preset.ini` file that references the actual model(s): - -```ini -hf-repo = user/my-model-main -hf-repo-draft = user/my-model-draft -temp = 0.8 -ctx-size = 1024 -; (and other configurations) +llama-cli -hf username/my-preset --temp 0.1 ``` ### Named presets diff --git a/docs/speculative.md b/docs/speculative.md index 43d181858912..8f91256c4a4d 100644 --- a/docs/speculative.md +++ b/docs/speculative.md @@ -13,6 +13,45 @@ The `llama-server` application supports several implementations of speculative d A much smaller model (called the _draft model_) generates drafts. A draft model is the most used approach in speculative decoding. +### EAGLE-3 (`draft-eagle3`) + +EAGLE-3 uses a small draft model that reads the target model's hidden states to predict the next tokens, so it +reaches higher acceptance than a standalone draft model of the same size. The draft is a one-layer transformer +trained for a specific target model; it shares the target model's tokenizer and, optionally, uses a reduced draft +vocabulary with its own `lm_head`, which is mapped back using a `d2t` table. + +Convert the EAGLE-3 checkpoint with `--target-model-dir` so it inherits the target's tokenizer and the layer +indices to read. Both the SpecForge `LlamaForCausalLMEagle3` and the vLLM/AngelSlim `Eagle3LlamaForCausalLM` +checkpoint formats are supported (for example [`AngelSlim/Qwen3-4B_eagle3`](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3) +for `Qwen/Qwen3-4B`): + +```bash +python convert_hf_to_gguf.py AngelSlim/Qwen3-4B_eagle3 \ + --target-model-dir Qwen/Qwen3-4B --outtype bf16 --outfile Qwen3-4B-eagle3.gguf + +llama-server -m Qwen3-4B.gguf -md Qwen3-4B-eagle3.gguf --spec-type draft-eagle3 +``` + +Supported EAGLE-3 draft models include: + +- [yuhuili/EAGLE3-LLaMA3.1-Instruct-8B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.1-Instruct-8B) +- [yuhuili/EAGLE3-LLaMA3.3-Instruct-70B](https://huggingface.co/yuhuili/EAGLE3-LLaMA3.3-Instruct-70B) +- [RedHatAI/gemma-4-31B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-31B-it-speculator.eagle3) +- [RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3](https://huggingface.co/RedHatAI/gemma-4-26B-A4B-it-speculator.eagle3) +- [Tengyunw/qwen3_8b_eagle3](https://huggingface.co/Tengyunw/qwen3_8b_eagle3) +- [Tengyunw/qwen3_30b_moe_eagle3](https://huggingface.co/Tengyunw/qwen3_30b_moe_eagle3) +- [AngelSlim/Qwen3-1.7B_eagle3](https://huggingface.co/AngelSlim/Qwen3-1.7B_eagle3) +- [AngelSlim/Qwen3-4B_eagle3](https://huggingface.co/AngelSlim/Qwen3-4B_eagle3) +- [AngelSlim/Qwen3-8B_eagle3](https://huggingface.co/AngelSlim/Qwen3-8B_eagle3) +- [AngelSlim/Qwen3-14B_eagle3](https://huggingface.co/AngelSlim/Qwen3-14B_eagle3) +- [AngelSlim/Qwen3-32B_eagle3](https://huggingface.co/AngelSlim/Qwen3-32B_eagle3) +- [AngelSlim/Qwen3-a3B_eagle3](https://huggingface.co/AngelSlim/Qwen3-a3B_eagle3) +- [RedHatAI/gpt-oss-20b-speculator.eagle3](https://huggingface.co/RedHatAI/gpt-oss-20b-speculator.eagle3) +- [lmsys/EAGLE3-gpt-oss-120b-bf16](https://huggingface.co/lmsys/EAGLE3-gpt-oss-120b-bf16) +- [nvidia/gpt-oss-120b-Eagle3-long-context](https://huggingface.co/nvidia/gpt-oss-120b-Eagle3-long-context) + +For the full and up-to-date list of supported models, see #18039. + ### n-gram Cache (`ngram-cache`) An n-gram is a sequence of n tokens. The n-gram cache implementation maintains statistics about short n-gram sequences. @@ -108,7 +147,7 @@ If a draft model is combined with a draftless decoding the draftless decoding ha ### General Speculative Parameters ``` ---spec-type [none|draft-simple|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] +--spec-type [none|draft-simple|draft-eagle3|draft-mtp|ngram-cache|ngram-simple|ngram-map-k|ngram-map-k4v|ngram-mod] comma-separated list of types of speculative decoding to use (default: none) (env: LLAMA_ARG_SPEC_TYPE) @@ -247,6 +286,7 @@ Specifies a comma-separated list of speculative decoding types to use. |------|-------------| | `none` | No speculative decoding (default) | | `draft-simple` | Use a simple draft model for speculation | +| `draft-eagle3` | Use an EAGLE-3 draft model that reads the target's hidden states | | `draft-mtp` | Use Multi Token Prediction (MTP) heads from the main model | | `ngram-cache` | Use n-gram cache lookup | | `ngram-simple` | Use simple n-gram pattern matching | diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 077fcfacac2e..83abd259da57 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -198,18 +198,18 @@ def __init__(self, content: str, deps: list | None = None): SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}' PRIMITIVE_RULES = { - 'boolean' : BuiltinRule('("true" | "false") space', []), + 'boolean' : BuiltinRule('("true" | "false")', []), 'decimal-part' : BuiltinRule('[0-9]{1,16}', []), 'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []), - 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), - 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']), + 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)?', ['integral-part', 'decimal-part']), + 'integer' : BuiltinRule('("-"? integral-part)', ['integral-part']), 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), - 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), - 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), - 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []), + 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? space "}"', ['string', 'value']), + 'array' : BuiltinRule('"[" space ( value ("," space value)* )? space "]"', ['value']), + 'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\""', []), 'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []), - 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']), - 'null' : BuiltinRule('"null" space', []), + 'string' : BuiltinRule(r'"\"" char* "\""', ['char']), + 'null' : BuiltinRule('"null"', []), } # TODO: support "uri", "email" string formats @@ -217,9 +217,9 @@ def __init__(self, content: str, deps: list | None = None): 'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), 'date-time' : BuiltinRule('date "T" time', ['date', 'time']), - 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']), - 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']), - 'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']), + 'date-string' : BuiltinRule('"\\"" date "\\""', ['date']), + 'time-string' : BuiltinRule('"\\"" time "\\""', ['time']), + 'date-time-string': BuiltinRule('"\\"" date-time "\\""', ['date-time']), } DOTALL = '[\\U00000000-\\U0010FFFF]' @@ -319,7 +319,7 @@ def visit(node): out.append(f'[^"{"".join(rejects)}] {char_rule}*') visit(trie) - out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space') + out.append(f' ){"" if trie.is_end_of_string else "?"} ["]') return ''.join(out) def _add_rule(self, name, rule): @@ -549,7 +549,7 @@ def join_seq(): return self._add_rule( name, to_rule(transform()) if self._raw_pattern \ - else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\" space") + else "\"\\\"\" (" + to_rule(transform()) + ") \"\\\"\"") def _resolve_ref(self, ref): @@ -580,10 +580,10 @@ def visit(self, schema, name): return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type])) elif 'const' in schema: - return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space') + return self._add_rule(rule_name, self._generate_constant_rule(schema['const'])) elif 'enum' in schema: - rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space' + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ')' return self._add_rule(rule_name, rule) elif schema_type in (None, 'object') and \ @@ -624,7 +624,7 @@ def add_component(comp_schema, is_required): enum_intersection &= s if enum_intersection: - rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ') space' + rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in sorted(enum_intersection))) + ')' return self._add_rule(rule_name, rule) return self._add_rule(rule_name, self._build_object_rule(properties, required, hybrid_name, additional_properties=None)) @@ -638,12 +638,12 @@ def add_component(comp_schema, is_required): ' "," space '.join( self.visit(item, f'{name}{"-" if name else ""}tuple-{i}') for i, item in enumerate(items)) + - ' "]" space') + ' space "]"') else: item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item') min_items = schema.get("minItems", 0) max_items = schema.get("maxItems") - return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space') + return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' space "]"') elif schema_type in (None, 'string') and 'pattern' in schema: return self._visit_pattern(schema['pattern'], rule_name) @@ -663,7 +663,7 @@ def add_component(comp_schema, is_required): min_len = schema.get('minLength', 0) max_len = schema.get('maxLength') - return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space') + return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\""') elif schema_type in (None, 'integer') and \ ('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema): @@ -680,7 +680,7 @@ def add_component(comp_schema, is_required): out = ["("] _generate_min_max_int(min_value, max_value, out) - out.append(") space") + out.append(")") return self._add_rule(rule_name, ''.join(out)) elif (schema_type == 'object') or (len(schema) == 0): @@ -765,7 +765,7 @@ def get_recursive_refs(ks, first_is_optional): rule += ' )' rule += ' )?' - rule += ' "}" space' + rule += ' space "}"' return rule diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh index bf7d6b53bf23..9dd66cb67655 100755 --- a/examples/sycl/build.sh +++ b/examples/sycl/build.sh @@ -3,15 +3,45 @@ # Copyright (C) 2024 Intel Corporation # SPDX-License-Identifier: MIT +print_usage() { + echo "Usage: ./build.sh [fp32|fp16] [--help]" + echo "" + echo "Options:" + echo " fp32 Build with FP32 precision (default)" + echo " fp16 Build with FP16 precision (faster for long-prompt inference)" + echo " --help Print this help message" +} + +PRECISION=fp32 + +for arg in "$@"; do + case "$arg" in + --help) + print_usage + exit 0 + ;; + fp32|fp16) + PRECISION="$arg" + ;; + *) + echo "Error: unknown option '$arg'" + print_usage + exit 1 + ;; + esac +done + mkdir -p build cd build source /opt/intel/oneapi/setvars.sh -#for FP16 -#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference - -#for FP32 -cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF +if [ "$PRECISION" = "fp16" ]; then + #for FP16 + cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON -DLLAMA_OPENSSL=OFF # faster for long-prompt inference +else + #for FP32 + cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_OPENSSL=OFF +fi #build example/main #cmake --build . --config Release --target main diff --git a/examples/sycl/update-ops-doc.sh b/examples/sycl/update-ops-doc.sh new file mode 100755 index 000000000000..6f26fc4574bb --- /dev/null +++ b/examples/sycl/update-ops-doc.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# MIT license +# Copyright (C) 2026 Intel Corporation +# SPDX-License-Identifier: MIT + +./build/bin/test-backend-ops support --output csv > docs/ops/SYCL.csv +./scripts/create_ops_docs.py + diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat index fc8b33bbc269..9a82edbefe62 100644 --- a/examples/sycl/win-build-sycl.bat +++ b/examples/sycl/win-build-sycl.bat @@ -3,6 +3,23 @@ :: Copyright (C) 2024 Intel Corporation :: SPDX-License-Identifier: MIT +IF /I "%1"=="--help" ( + echo Usage: win-build-sycl.bat [fp32^|fp16] [--help] + echo. + echo Options: + echo fp32 Build with FP32 precision ^(default^) + echo fp16 Build with FP16 precision ^(faster for long-prompt inference^) + echo --help Print this help message + exit /B 0 +) + +SET PRECISION=%1 +IF "%PRECISION%"=="" SET PRECISION=fp32 +IF /I NOT "%PRECISION%"=="fp32" IF /I NOT "%PRECISION%"=="fp16" ( + echo Error: invalid value '%PRECISION%'. Use 'fp32' or 'fp16'. + echo Usage: win-build-sycl.bat [fp32^|fp16] [--help] + exit /B 1 +) IF not exist build (mkdir build) cd build @@ -11,12 +28,14 @@ if %errorlevel% neq 0 goto ERROR @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force if %errorlevel% neq 0 goto ERROR -:: for FP16 -:: faster for long-prompt inference -:: cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON - -:: for FP32 -cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release +IF /I "%PRECISION%"=="fp16" ( + :: for FP16 + :: faster for long-prompt inference + cmake -G "MinGW Makefiles" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON +) ELSE ( + :: for FP32 + cmake -G "Ninja" .. -DLLAMA_OPENSSL=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release +) if %errorlevel% neq 0 goto ERROR :: build all binary diff --git a/examples/sycl/win-update-ops-doc.bat b/examples/sycl/win-update-ops-doc.bat new file mode 100644 index 000000000000..b032bcfe1ef9 --- /dev/null +++ b/examples/sycl/win-update-ops-doc.bat @@ -0,0 +1,8 @@ +@echo off + +rem MIT license +rem Copyright (C) 2026 Intel Corporation +rem SPDX-License-Identifier: MIT + +build\bin\test-backend-ops support --output csv > docs\ops\SYCL.csv +python scripts\create_ops_docs.py diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 8f7cb8cdfd20..0ec62a3773d6 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -4,8 +4,8 @@ project("ggml" C CXX ASM) ### GGML Version set(GGML_VERSION_MAJOR 0) -set(GGML_VERSION_MINOR 14) -set(GGML_VERSION_PATCH 0) +set(GGML_VERSION_MINOR 15) +set(GGML_VERSION_PATCH 3) set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}") list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/") @@ -249,7 +249,7 @@ option(GGML_SYCL "ggml: use SYCL" option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON) option(GGML_SYCL_HOST_MEM_FALLBACK "ggml: allow host memory fallback in SYCL reorder (requires kernel 6.8+)" ON) -option(GGML_SYCL_SUPPORT_LEVEL_ZERO "ggml: use Level Zero API in SYCL backend" ON) +option(GGML_SYCL_SUPPORT_LEVEL_ZERO_API "ggml: use Level Zero API in SYCL backend" ON) option(GGML_SYCL_DNN "ggml: enable oneDNN in the SYCL backend" ON) set (GGML_SYCL_TARGET "INTEL" CACHE STRING "ggml: sycl target device") @@ -266,7 +266,6 @@ set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING "ggml: OpenCL API version to target") option(GGML_HEXAGON "ggml: enable Hexagon backend" OFF) -set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)") # toolchain for vulkan-shaders-gen set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen") diff --git a/ggml/include/ggml-sycl.h b/ggml/include/ggml-sycl.h index 5ce349a880ed..418a7ba978b4 100644 --- a/ggml/include/ggml-sycl.h +++ b/ggml/include/ggml-sycl.h @@ -27,6 +27,14 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int de // split tensor buffer that splits matrices by rows across multiple devices GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); +// Tensor parallelism (--split-mode tensor): comm_init/free/allreduce_tensor +// trio queried by the meta-backend via ggml_backend_reg_get_proc_address. +// See typedefs in ggml/include/ggml-backend.h. Mirrors the CUDA backend's +// pattern (ggml_backend_cuda_comm_*). +GGML_BACKEND_API void * ggml_backend_sycl_comm_init(ggml_backend_t * backends, size_t n_backends); +GGML_BACKEND_API void ggml_backend_sycl_comm_free(void * comm_ctx); +GGML_BACKEND_API bool ggml_backend_sycl_comm_allreduce_tensor(void * comm_ctx, struct ggml_tensor ** tensors); + // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index d6807b6dd47a..ac133665d978 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -429,7 +429,8 @@ extern "C" { GGML_TYPE_MXFP4 = 39, // MXFP4 (1 block) GGML_TYPE_NVFP4 = 40, // NVFP4 (4 blocks, E4M3 scale) GGML_TYPE_Q1_0 = 41, - GGML_TYPE_COUNT = 42, + GGML_TYPE_Q2_0 = 42, + GGML_TYPE_COUNT = 43, }; // precision @@ -473,6 +474,7 @@ extern "C" { GGML_FTYPE_MOSTLY_MXFP4 = 25, // except 1d tensors GGML_FTYPE_MOSTLY_NVFP4 = 26, // except 1d tensors GGML_FTYPE_MOSTLY_Q1_0 = 27, // except 1d tensors + GGML_FTYPE_MOSTLY_Q2_0 = 28, // except 1d tensors }; // available tensor operations: diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index c26c3f1470d8..89e5180d931f 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -438,7 +438,14 @@ if (GGML_CPU_ALL_VARIANTS) ggml_add_cpu_backend_variant(power8_2 POWER8 VSX) ggml_add_cpu_backend_variant(power9 POWER9 VSX) ggml_add_cpu_backend_variant(power10 POWER10 VSX) - ggml_add_cpu_backend_variant(power11 POWER11 VSX) + # POWER11 backend: only if compiler supports -mcpu=power11 + check_cxx_compiler_flag("-mcpu=power11" GGML_CXX_SUPPORTS_POWER11) + if (GGML_CXX_SUPPORTS_POWER11) + message(STATUS "Compiler supports -mcpu=power11, enabling POWER11 backend") + ggml_add_cpu_backend_variant(power11 POWER11 VSX) + else() + message(STATUS "Skipping POWER11 backend: compiler does not support -mcpu=power11") + endif() else() message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}") endif() diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp index 87615921c09b..644130cd134d 100644 --- a/ggml/src/ggml-backend.cpp +++ b/ggml/src/ggml-backend.cpp @@ -1551,6 +1551,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s int split_backend_id = split->backend_id; ggml_backend_t split_backend = sched->backends[split_backend_id]; + ggml_backend_synchronize(split_backend); + // copy the input tensors to the split backend for (int input_id = 0; input_id < split->n_inputs; input_id++) { ggml_backend_t input_backend = ggml_backend_sched_get_tensor_backend(sched, split->inputs[input_id]); @@ -1561,15 +1563,15 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s // inputs from the user must be copied immediately to prevent the user overwriting the data before the copy is done if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]); - } else { + } else if (!split_backend->iface.cpy_tensor_async) { ggml_backend_synchronize(split_backend); } - ggml_backend_tensor_copy(input, input_cpy); + ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy); } else { // wait for the split backend to finish using the input before overwriting it if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]); - } else { + } else if (!split_backend->iface.cpy_tensor_async) { ggml_backend_synchronize(split_backend); } @@ -1674,6 +1676,8 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } } + ggml_backend_synchronize(split_backend); + if (!sched->callback_eval) { enum ggml_status ec = ggml_backend_graph_compute_async(split_backend, &split->graph); if (ec != GGML_STATUS_SUCCESS) { diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h index f05683b44cd9..29028b32a2fd 100644 --- a/ggml/src/ggml-common.h +++ b/ggml/src/ggml-common.h @@ -96,6 +96,9 @@ typedef sycl::half2 ggml_half2; #define QI1_0 (QK1_0 / 32) #define QR1_0 1 +#define QI2_0 (QK2_0 / 32) +#define QR2_0 1 + #define QI4_0 (QK4_0 / (4 * QR4_0)) #define QR4_0 2 @@ -181,6 +184,13 @@ typedef struct { } block_q1_0; static_assert(sizeof(block_q1_0) == sizeof(ggml_half) + QK1_0 / 8, "wrong q1_0 block size/padding"); +#define QK2_0 64 +typedef struct { + ggml_half d; // delta (scale) + uint8_t qs[QK2_0 / 4]; // 2 bits per element +} block_q2_0; +static_assert(sizeof(block_q2_0) == sizeof(ggml_half) + QK2_0 / 4, "wrong q2_0 block size/padding"); + #define QK4_0 32 typedef struct { ggml_half d; // delta diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 8c735a045b35..f7c557af4c0c 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -389,7 +389,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}") string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}") - if (EXTRACTED_NUMBER GREATER_EQUAL 10) + if (EXTRACTED_NUMBER EQUAL 10 OR EXTRACTED_NUMBER EQUAL 11) list(APPEND ARCH_FLAGS -mcpu=power10) elseif (EXTRACTED_NUMBER EQUAL 9) list(APPEND ARCH_FLAGS -mcpu=power9) diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp index d9383a04be83..9f3a744b5de3 100644 --- a/ggml/src/ggml-cpu/amx/mmq.cpp +++ b/ggml/src/ggml-cpu/amx/mmq.cpp @@ -2417,15 +2417,14 @@ void ggml_backend_amx_mul_mat(const ggml_compute_params * params, struct ggml_te // Q4_K, Q5_K, Q6_K, IQ4_XS handles 8 TILE_K per blck_size GGML_ASSERT(TILE_K == blck_size || TILE_K * 8 == blck_size); - parallel_for_ggml(params, n_batch, [&](int begin, int end) { - for (int batch_idx = begin; batch_idx < end; ++batch_idx) { + parallel_for_ggml(params, n_batch * M, [&](int begin, int end) { + for (int idx = begin; idx < end; ++idx) { + int batch_idx = idx / M; + int m = idx % M; int64_t src1_offset = ggml_batch_offset(src1, batch_idx, ne2); const float * A_data = (const float *)((const char *)src1->data + src1_offset); char * wdata_batch = (char *)wdata + batch_idx * M * row_size_A; - - for (int m = 0; m < M; ++m) { - from_float(A_data + m * K, wdata_batch + m * row_size_A, K); - } + from_float(A_data + m * K, wdata_batch + m * row_size_A, K); } }); }); diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index b0391a67c88d..226346c69cf9 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -17,6 +17,7 @@ #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 #define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0 +#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K @@ -83,6 +84,7 @@ #elif defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64) // quants.c #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 +#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_K_4x4_generic ggml_quantize_mat_q8_K_4x4 @@ -114,6 +116,7 @@ #define quantize_row_q8_K_generic quantize_row_q8_K #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 #define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0 +#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K #define ggml_vec_dot_iq1_m_q8_K_generic ggml_vec_dot_iq1_m_q8_K @@ -163,6 +166,7 @@ #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 #define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0 +#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 @@ -203,6 +207,7 @@ #elif defined(__riscv) // quants.c #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 +#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x1_generic ggml_quantize_mat_q8_0_4x1 #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 @@ -244,6 +249,7 @@ #define quantize_row_q8_K_generic quantize_row_q8_K #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 #define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0 +#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K @@ -293,7 +299,6 @@ #define ggml_gemm_q8_0_4x8_q8_0_generic ggml_gemm_q8_0_4x8_q8_0 #elif defined(__wasm__) // quants.c -#define ggml_vec_dot_q4_1_q8_1_generic ggml_vec_dot_q4_1_q8_1 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K #define ggml_vec_dot_iq2_xxs_q8_K_generic ggml_vec_dot_iq2_xxs_q8_K @@ -308,6 +313,7 @@ #define ggml_vec_dot_mxfp4_q8_0_generic ggml_vec_dot_mxfp4_q8_0 #define ggml_vec_dot_nvfp4_q8_0_generic ggml_vec_dot_nvfp4_q8_0 #define ggml_vec_dot_q1_0_q8_0_generic ggml_vec_dot_q1_0_q8_0 +#define ggml_vec_dot_q2_0_q8_0_generic ggml_vec_dot_q2_0_q8_0 // repack.cpp #define ggml_quantize_mat_q8_0_4x4_generic ggml_quantize_mat_q8_0_4x4 #define ggml_quantize_mat_q8_0_4x8_generic ggml_quantize_mat_q8_0_4x8 diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index fe6213329708..9faa4a014193 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -219,6 +219,80 @@ void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi #endif } +void ggml_vec_dot_q2_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK2_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + float sumf = 0.0f; + +#if defined(__ARM_NEON) + // Replicate pattern: each byte repeated 4 times + static const uint8_t tbl_idx_lo[16] = {0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3}; + static const uint8_t tbl_idx_hi[16] = {4,4,4,4, 5,5,5,5, 6,6,6,6, 7,7,7,7}; + // Right-shift amounts: 0,2,4,6 repeated for each group of 4 + static const int8_t shift_vals[16] = {0,-2,-4,-6, 0,-2,-4,-6, 0,-2,-4,-6, 0,-2,-4,-6}; + + const uint8x16_t idx_lo = vld1q_u8(tbl_idx_lo); + const uint8x16_t idx_hi = vld1q_u8(tbl_idx_hi); + const int8x16_t shifts = vld1q_s8(shift_vals); + const uint8x16_t mask2 = vdupq_n_u8(0x03); + const int8x16_t one = vdupq_n_s8(1); + + float32x4_t sumv = vdupq_n_f32(0.0f); + + for (int i = 0; i < nb; i++) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d); + + // group 64: one Q2_0 block (64 weights) maps to two Q8_0 blocks (2 * 32 = 64) + for (int k = 0; k < 2; k++) { + const block_q8_0 * GGML_RESTRICT yb = &y[i * 2 + k]; + const float d1 = GGML_CPU_FP16_TO_FP32(yb->d); + + // Load 8 bytes of packed 2-bit values + const uint8x8_t raw = vld1_u8(&x[i].qs[k * 8]); + const uint8x16_t raw16 = vcombine_u8(raw, raw); + + // First 16 elements: replicate bytes 0-3, shift, mask, subtract 1 + uint8x16_t bytes0 = vqtbl1q_u8(raw16, idx_lo); + int8x16_t qv0 = vsubq_s8( + vreinterpretq_s8_u8(vandq_u8(vshlq_u8(bytes0, shifts), mask2)), + one); + + // Second 16 elements: replicate bytes 4-7, shift, mask, subtract 1 + uint8x16_t bytes1 = vqtbl1q_u8(raw16, idx_hi); + int8x16_t qv1 = vsubq_s8( + vreinterpretq_s8_u8(vandq_u8(vshlq_u8(bytes1, shifts), mask2)), + one); + + // Load Q8_0 values and dot product + const int8x16_t y0 = vld1q_s8(yb->qs); + const int8x16_t y1 = vld1q_s8(yb->qs + 16); + + int32x4_t p0 = ggml_vdotq_s32(vdupq_n_s32(0), qv0, y0); + int32x4_t p1 = ggml_vdotq_s32(p0, qv1, y1); + + sumv = vmlaq_n_f32(sumv, vcvtq_f32_s32(p1), d0 * d1); + } + } + + sumf = vaddvq_f32(sumv); +#else + ggml_vec_dot_q2_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); + return; +#endif + + *s = sumf; +} void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index eb8341c9aecc..15c31fa01ec3 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -227,6 +227,12 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_0, .nrows = 1, }, + [GGML_TYPE_Q2_0] = { + .from_float = quantize_row_q2_0, + .vec_dot = ggml_vec_dot_q2_0_q8_0, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, [GGML_TYPE_Q4_0] = { .from_float = quantize_row_q4_0, .vec_dot = ggml_vec_dot_q4_0_q8_0, diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index e13828e3be6f..0b8323e60ce6 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -2345,7 +2345,7 @@ class tinyBLAS_Q0_PPC { else if (n_aligned % 16 == 0) nc = 16; else nc = 8; } - bool can_use_tiled = n_aligned > 0 && (m % mc == 0) && (k % kc == 0); + bool can_use_tiled = n_aligned > 0 && (m % mc == 0); if (can_use_tiled) { matmul_tiled(m, n_aligned, mc, nc, kc); if (n > n_aligned) { @@ -3063,13 +3063,14 @@ class tinyBLAS_Q0_PPC { int64_t ii = (job / xtiles) * mc; int64_t jj = (job % xtiles) * nc; for (int64_t kk = 0; kk < k; kk += kc) { + int64_t k_cur = MIN(kc, k - kk); if constexpr(is_Ablock_q4) { - packNormal_q4_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack); + packNormal_q4_fp16(A + ii * lda + kk, lda, mc, k_cur, (uint8_t *)A_pack); } else { - packNormal_q8_fp16(A + ii * lda + kk, lda, mc, kc, (uint8_t *)A_pack); + packNormal_q8_fp16(A + ii * lda + kk, lda, mc, k_cur, (uint8_t *)A_pack); } - packNormal_q8_fp16(B + jj * ldb + kk, ldb, nc, kc, (uint8_t *)B_pack); - KERNEL_Q0(ii, jj, mc, nc, kc, kk, A_pack, B_pack); + packNormal_q8_fp16(B + jj * ldb + kk, ldb, nc, k_cur, (uint8_t *)B_pack); + KERNEL_Q0(ii, jj, mc, nc, k_cur, kk, A_pack, B_pack); } } } diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 74611dce7f1a..902515840b51 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -665,6 +665,7 @@ void ggml_compute_forward_add( ggml_compute_forward_add_non_quantized(params, dst); } break; case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -1115,6 +1116,7 @@ void ggml_compute_forward_add1( } } break; case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -1245,6 +1247,7 @@ void ggml_compute_forward_acc( case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -3688,8 +3691,6 @@ static void ggml_compute_forward_norm_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - const int ith = params->ith; const int nth = params->nth; @@ -3703,25 +3704,49 @@ static void ggml_compute_forward_norm_f32( for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = ith; i01 < ne01; i01 += nth) { - const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const char * x = (const char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + char * y = (char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3; - float sum = 0.0; - ggml_vec_sum_f32(ne00, &sum, x); - float mean = sum/ne00; + if (nb00 == sizeof(float) && nb0 == sizeof(float)) { + const float * xf = (const float *) x; - float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - float variance = 0; + float sum = 0.0; + ggml_vec_sum_f32(ne00, &sum, xf); + float mean = sum/ne00; + + float * yf = (float *) y; + float variance = 0; #ifdef GGML_USE_ACCELERATE - mean = -mean; - vDSP_vsadd(x, 1, &mean, y, 1, ne00); - vDSP_measqv(y, 1, &variance, ne00); + mean = -mean; + vDSP_vsadd(xf, 1, &mean, yf, 1, ne00); + vDSP_measqv(yf, 1, &variance, ne00); #else - variance = ggml_vec_cvar_f32(ne00, y, x, mean); + variance = ggml_vec_cvar_f32(ne00, yf, xf, mean); #endif //GGML_USE_ACCELERATE - const float scale = 1.0f/sqrtf(variance + eps); - ggml_vec_scale_f32(ne00, y, scale); + const float scale = 1.0f/sqrtf(variance + eps); + ggml_vec_scale_f32(ne00, yf, scale); + } else { + float sum = 0.0; + for (int64_t i00 = 0; i00 < ne00; i00++) { + sum += *(const float *) (x + i00*nb00); + } + const float mean = sum/ne00; + + float variance = 0.0f; + for (int64_t i00 = 0; i00 < ne00; i00++) { + const float v = *(const float *) (x + i00*nb00) - mean; + *(float *) (y + i00*nb0) = v; + variance += v * v; + } + variance /= ne00; + + const float scale = 1.0f/sqrtf(variance + eps); + for (int64_t i00 = 0; i00 < ne00; i00++) { + *(float *) (y + i00*nb0) *= scale; + } + } } } } @@ -4142,8 +4167,6 @@ static void ggml_compute_forward_l2_norm_f32( GGML_ASSERT(ggml_are_same_shape(src0, dst)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - const int ith = params->ith; const int nth = params->nth; @@ -4158,20 +4181,27 @@ static void ggml_compute_forward_l2_norm_f32( for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { for (int64_t i01 = ith; i01 < ne01; i01 += nth) { - const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + const char * x = (const char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; ggml_float sum = 0.0; for (int64_t i00 = 0; i00 < ne00; i00++) { - sum += (ggml_float)(x[i00] * x[i00]); + const float xi = *(const float *) (x + i00*nb00); + sum += (ggml_float)(xi * xi); } - float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); - - memcpy(y, x, ne00 * sizeof(float)); - const float scale = 1.0f/fmaxf(sqrtf(sum), eps); - ggml_vec_scale_f32(ne00, y, scale); + char * y = (char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3; + + if (nb00 == sizeof(float) && nb0 == sizeof(float)) { + memcpy(y, x, ne00 * sizeof(float)); + ggml_vec_scale_f32(ne00, (float *) y, scale); + } else { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const float xi = *(const float *) (x + i00*nb00); + *(float *) (y + i00*nb0) = xi * scale; + } + } } } } @@ -4415,6 +4445,7 @@ void ggml_compute_forward_out_prod( switch (src0->type) { case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -4691,6 +4722,7 @@ void ggml_compute_forward_set( case GGML_TYPE_F16: case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -4915,6 +4947,7 @@ void ggml_compute_forward_get_rows( switch (src0->type) { case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -5641,6 +5674,7 @@ void ggml_compute_forward_clamp( } break; case GGML_TYPE_BF16: case GGML_TYPE_Q1_0: + case GGML_TYPE_Q2_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index e5f9a4083f9c..5e36459f8cbc 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -26,6 +26,10 @@ void quantize_row_q1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in quantize_row_q1_0_ref(x, y, k); } +void quantize_row_q2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { + quantize_row_q2_0_ref(x, y, k); +} + void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k) { quantize_row_q4_0_ref(x, y, k); } @@ -170,6 +174,53 @@ void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, c *s = sumf; } +void ggml_vec_dot_q2_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK2_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q2_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + float sumf = 0.0f; + + for (int i = 0; i < nb; i++) { + const float d0 = GGML_CPU_FP16_TO_FP32(x[i].d); + + float sumi = 0.0f; + + // group 64: one Q2_0 block (64 weights) maps to two Q8_0 blocks (2 * 32 = 64) + for (int k = 0; k < 2; k++) { + const block_q8_0 * GGML_RESTRICT yb = &y[i * 2 + k]; + const float d1 = GGML_CPU_FP16_TO_FP32(yb->d); + int sumi_block = 0; + + const uint8_t * GGML_RESTRICT qs = &x[i].qs[k * 8]; + const int8_t * GGML_RESTRICT qy = yb->qs; + + for (int b = 0; b < 8; ++b) { + const uint8_t byte = qs[b]; + // Extract 4 two-bit values, map {0,1,2,3} -> {-1,0,1,2} + sumi_block += ((int)((byte >> 0) & 3) - 1) * qy[b*4 + 0]; + sumi_block += ((int)((byte >> 2) & 3) - 1) * qy[b*4 + 1]; + sumi_block += ((int)((byte >> 4) & 3) - 1) * qy[b*4 + 2]; + sumi_block += ((int)((byte >> 6) & 3) - 1) * qy[b*4 + 3]; + } + + sumi += d1 * sumi_block; + } + + sumf += d0 * sumi; + } + + *s = sumf; +} void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { const int qk = QK8_0; diff --git a/ggml/src/ggml-cpu/quants.h b/ggml/src/ggml-cpu/quants.h index d4bc87a1c052..93ea7eeffe5b 100644 --- a/ggml/src/ggml-cpu/quants.h +++ b/ggml/src/ggml-cpu/quants.h @@ -13,6 +13,7 @@ extern "C" { // Quantization void quantize_row_q1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); @@ -38,6 +39,7 @@ void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, // Dot product void ggml_vec_dot_q1_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q2_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -71,6 +73,7 @@ void quantize_row_q8_0_generic(const float * GGML_RESTRICT x, void * GGML_RESTRI void quantize_row_q8_1_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); void quantize_row_q8_K_generic(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); void ggml_vec_dot_q1_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_q2_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q4_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q4_1_q8_1_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_q5_0_q8_0_generic(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); diff --git a/ggml/src/ggml-cpu/vec.cpp b/ggml/src/ggml-cpu/vec.cpp index 67b6b05cac81..ff2b636df86c 100644 --- a/ggml/src/ggml-cpu/vec.cpp +++ b/ggml/src/ggml-cpu/vec.cpp @@ -75,12 +75,12 @@ void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * G ay1 = GGML_F32_VEC_LOAD(y + i); sum1 = GGML_F32_VEC_FMA(sum1, ax1, ay1); } - // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmad on available elements only + // maximum number of leftover elements will be less that ggml_f32_epr. Apply predicated svmla on available elements only if (np2 < n) { svbool_t pg = svwhilelt_b32(np2, n); ax1 = svld1_f32(pg, x + np2); ay1 = svld1_f32(pg, y + np2); - sum1 = svmad_f32_m(pg, ax1, ay1, sum1); + sum1 = svmla_f32_m(pg, sum1, ax1, ay1); } // reduce sum1,sum2 to sum1 GGML_F32_VEC_REDUCE(sumf, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum8); diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu index c25f42b32bbc..2e38077bf67f 100644 --- a/ggml/src/ggml-cuda/binbcast.cu +++ b/ggml/src/ggml-cuda/binbcast.cu @@ -34,26 +34,26 @@ template = (uint32_t)ne0 || i1 >= (uint32_t)ne1 || i2 >= (uint32_t)ne2 || i3 >= ne3.z) { + if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3.z) { return; } @@ -69,25 +69,32 @@ static __global__ void k_bin_bcast(const src0_t * src0, const uint32_t i12 = fastmodulo(i2, ne12); const uint32_t i13 = fastmodulo(i3, ne13); - const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; - const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01; + const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11; + const size_t i_dst = size_t( i3)*s3 + size_t( i2)*s2 + size_t( i1)*s1; const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr; dst_t * dst_row = dst + i_dst; + const uint32_t s0 = blockDim.x * gridDim.x; + ggml_cuda_pdl_sync(); - for (int i0 = i0s; i0 < ne0; i0 += blockDim.x * gridDim.x) { + for (uint32_t i0 = i0s; i0 < ne0; i0 += s0) { const uint32_t i10 = fastmodulo(i0, ne10); - float result = src0_row ? (float) src0_row[i0*s00] : 0.0f; + float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f; if constexpr (sizeof...(src1_ptrs) > 0) { - result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10]))); + result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10]))); } else { - result = bin_op(result, (float)src1[i_src1 + i10*s10]); + result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]); } dst_row[i0] = (dst_t) result; + + // protect i0 from overflow + if (ne0 - i0 <= s0) { + break; + } } } @@ -110,19 +117,19 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const uint3 ne12, const uint3 ne13, /*const int s0,*/ - const int s1, - const int s2, - const int s3, - const int s00, - const int s01, - const int s02, - const int s03, - const int s10, - const int s11, - const int s12, - const int s13, + const uint32_t s1, + const uint32_t s2, + const uint32_t s3, + const uint32_t s00, + const uint32_t s01, + const uint32_t s02, + const uint32_t s03, + const uint32_t s10, + const uint32_t s11, + const uint32_t s12, + const uint32_t s13, src1_ptrs... src1s) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; + const uint32_t i = blockDim.x*blockIdx.x + threadIdx.x; const uint32_t i3 = fastdiv(i, prod_012); const uint32_t i2 = fastdiv(i - i3 * prod_012.z, prod_01); @@ -133,25 +140,25 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, return; } - const int i11 = fastmodulo(i1, ne11); - const int i12 = fastmodulo(i2, ne12); - const int i13 = fastmodulo(i3, ne13); + const uint32_t i11 = fastmodulo(i1, ne11); + const uint32_t i12 = fastmodulo(i2, ne12); + const uint32_t i13 = fastmodulo(i3, ne13); - const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; - const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + const size_t i_src0 = size_t( i3)*s03 + size_t( i2)*s02 + size_t( i1)*s01; + const size_t i_src1 = size_t(i13)*s13 + size_t(i12)*s12 + size_t(i11)*s11; + const size_t i_dst = size_t( i3)*s3 + size_t( i2)*s2 + size_t( i1)*s1; const src0_t * src0_row = src0 ? (src0 + i_src0) : nullptr; dst_t * dst_row = dst + i_dst; - const int i10 = fastmodulo(i0, ne10); + const uint32_t i10 = fastmodulo(i0, ne10); ggml_cuda_pdl_sync(); - float result = src0_row ? (float) src0_row[i0*s00] : 0.0f; + float result = src0_row ? (float) src0_row[size_t(i0)*s00] : 0.0f; if constexpr (sizeof...(src1_ptrs) > 0) { - result = (..., (result = bin_op(result, (float)src1s[i_src1 + i10*s10]))); + result = (..., (result = bin_op(result, (float)src1s[i_src1 + size_t(i10)*s10]))); } else { - result = bin_op(result, (float)src1[i_src1 + i10*s10]); + result = bin_op(result, (float)src1[i_src1 + size_t(i10)*s10]); } dst_row[i0] = (dst_t) result; @@ -248,6 +255,31 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * size_t s02 = nb02 / sizeof(src0_t); size_t s03 = nb03 / sizeof(src0_t); + GGML_ASSERT(ne0 <= std::numeric_limits::max()); + GGML_ASSERT(ne1 <= std::numeric_limits::max()); + GGML_ASSERT(ne2 <= std::numeric_limits::max()); + GGML_ASSERT(ne3 <= std::numeric_limits::max()); + + //GGML_ASSERT(s0 <= std::numeric_limits::max()); + GGML_ASSERT(s1 <= std::numeric_limits::max()); + GGML_ASSERT(s2 <= std::numeric_limits::max()); + GGML_ASSERT(s3 <= std::numeric_limits::max()); + + GGML_ASSERT(s00 <= std::numeric_limits::max()); + GGML_ASSERT(s01 <= std::numeric_limits::max()); + GGML_ASSERT(s02 <= std::numeric_limits::max()); + GGML_ASSERT(s03 <= std::numeric_limits::max()); + + GGML_ASSERT(s10 <= std::numeric_limits::max()); + GGML_ASSERT(s11 <= std::numeric_limits::max()); + GGML_ASSERT(s12 <= std::numeric_limits::max()); + GGML_ASSERT(s13 <= std::numeric_limits::max()); + + GGML_ASSERT(cne1[0] <= std::numeric_limits::max()); + GGML_ASSERT(cne1[1] <= std::numeric_limits::max()); + GGML_ASSERT(cne1[2] <= std::numeric_limits::max()); + GGML_ASSERT(cne1[3] <= std::numeric_limits::max()); + GGML_ASSERT(nb0 % sizeof(dst_t) == 0); GGML_ASSERT(nb1 % sizeof(dst_t) == 0); GGML_ASSERT(nb2 % sizeof(dst_t) == 0); @@ -263,6 +295,8 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * GGML_ASSERT(nb12 % sizeof(src1_t) == 0); GGML_ASSERT(nb13 % sizeof(src1_t) == 0); + GGML_ASSERT(ne2 * ne3 <= std::numeric_limits::max()); + const int block_size = 128; int64_t hne0 = std::max(ne0 / 2LL, 1LL); @@ -281,7 +315,13 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]); if (block_nums.z > 65535 || block_nums.y > 65535) { - int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size; + int64_t block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size; + + GGML_ASSERT(block_num <= std::numeric_limits::max()); + GGML_ASSERT(block_num * block_size <= std::numeric_limits::max()); + GGML_ASSERT(ne0 * ne1 <= std::numeric_limits::max()); + GGML_ASSERT(ne0 * ne1 * ne2 <= std::numeric_limits::max()); + const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2)); const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1)); const uint3 ne0_fastdiv = init_fastdiv_values((uint32_t) ne0); @@ -298,6 +338,10 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor * s10, s11, s12, s13, (const src1_t *) dst->src[I + 1]->data...); } } else { + GGML_ASSERT(int64_t(block_nums.x) * block_dims.x <= std::numeric_limits::max()); + GGML_ASSERT(int64_t(block_nums.y) * block_dims.y <= std::numeric_limits::max()); + GGML_ASSERT(int64_t(block_nums.z) * block_dims.z <= std::numeric_limits::max()); + const uint3 ne3_fastdiv = init_fastdiv_values((uint32_t) ne3); { const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(block_nums, block_dims, 0, stream); diff --git a/ggml/src/ggml-cuda/col2im-1d.cu b/ggml/src/ggml-cuda/col2im-1d.cu new file mode 100644 index 000000000000..fecd4c6a95d2 --- /dev/null +++ b/ggml/src/ggml-cuda/col2im-1d.cu @@ -0,0 +1,81 @@ +#include "col2im-1d.cuh" +#include "convert.cuh" + +// col2im_1d: scatter-add GEMM columns to 1D signal (gather approach) +// columns: [K*OC, T_in] -> output: [T_out, OC] +// Supports F32, F16, BF16 data with F32 accumulator. + +template +static __global__ void col2im_1d_kernel( + const T * __restrict__ col, + T * __restrict__ dst, + const int T_in, const uint3 T_out_fd, + const int OC, const int K, const int K_OC, + const int s0, const int p0, const int total) { + + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= total) return; + + // dst layout: [T_out, OC], ne[0]=T_out fastest + const uint2 qr = fast_div_modulo((uint32_t)idx, T_out_fd); // qr.x = idx / T_out, qr.y = idx % T_out + const int oc = (int)qr.x; + const int t_out = (int)qr.y; + const int t_abs = t_out + p0; // absolute position in uncropped signal + + // Gather: find all (t_in, k) where t_in*s + k == t_abs, 0 <= k < K + int t_in_min = (t_abs - K + s0) / s0; // ceil((t_abs - K + 1) / s) + if (t_in_min < 0) t_in_min = 0; + int t_in_max = t_abs / s0; + if (t_in_max >= T_in) t_in_max = T_in - 1; + + float sum = 0.0f; + for (int t_in = t_in_min; t_in <= t_in_max; t_in++) { + const int k = t_abs - t_in * s0; + // col layout: [K*OC, T_in], column index = oc * K + k + sum += ggml_cuda_cast(col[(oc * K + k) + t_in * K_OC]); + } + + dst[idx] = ggml_cuda_cast(sum); +} + +void ggml_cuda_op_col2im_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(ggml_is_contiguous(src0)); + + const int32_t s0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t OC = ((const int32_t *)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t *)(dst->op_params))[2]; + + const int K_OC = (int) src0->ne[0]; + const int T_in = (int) src0->ne[1]; + const int K = K_OC / OC; + const int T_out = (int) dst->ne[0]; + + const uint3 T_out_fd = init_fastdiv_values((uint32_t)T_out); + + const int total = T_out * OC; + const int block_size = 256; + const int num_blocks = (total + block_size - 1) / block_size; + + switch (src0->type) { + case GGML_TYPE_F32: { + col2im_1d_kernel<<>>( + (const float *)src0->data, (float *)dst->data, + T_in, T_out_fd, OC, K, K_OC, s0, p0, total); + } break; + case GGML_TYPE_F16: { + col2im_1d_kernel<<>>( + (const half *)src0->data, (half *)dst->data, + T_in, T_out_fd, OC, K, K_OC, s0, p0, total); + } break; + case GGML_TYPE_BF16: { + col2im_1d_kernel<<>>( + (const nv_bfloat16 *)src0->data, (nv_bfloat16 *)dst->data, + T_in, T_out_fd, OC, K, K_OC, s0, p0, total); + } break; + default: + GGML_ABORT("col2im_1d: unsupported type"); + } +} diff --git a/ggml/src/ggml-cuda/col2im-1d.cuh b/ggml/src/ggml-cuda/col2im-1d.cuh new file mode 100644 index 000000000000..efc3313c4d11 --- /dev/null +++ b/ggml/src/ggml-cuda/col2im-1d.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_op_col2im_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-cuda/concat.cu b/ggml/src/ggml-cuda/concat.cu index adba4d522a42..8d557092b2be 100644 --- a/ggml/src/ggml-cuda/concat.cu +++ b/ggml/src/ggml-cuda/concat.cu @@ -1,16 +1,18 @@ #include "concat.cuh" +#include + // contiguous kernels -template -static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont(const float * x, - const float * y, - float * dst, - int64_t ne00, - int64_t ne01, - int64_t ne02, - int64_t ne0, - int64_t ne1, - int64_t ne2) { +template +static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_cont(const T * x, + const T * y, + T * dst, + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne0, + int64_t ne1, + int64_t ne2) { static_assert(dim >= 0 && dim <= 2, "dim must be in [0, 2]"); const int64_t n = ne0 * ne1 * ne2; @@ -50,37 +52,37 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) concat_f32_cont } } -static void concat_f32_cuda(const float * x, - const float * y, - float * dst, - int64_t ne00, - int64_t ne01, - int64_t ne02, - int64_t ne0, - int64_t ne1, - int64_t ne2, - int dim, - cudaStream_t stream) { +template +static void concat_cont_cuda(const T * x, + const T * y, + T * dst, + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne0, + int64_t ne1, + int64_t ne2, + int dim, + cudaStream_t stream) { const int64_t n = ne0 * ne1 * ne2; const int num_blocks = (n + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE; if (dim == 0) { const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(num_blocks, CUDA_CONCAT_BLOCK_SIZE, 0, stream); - ggml_cuda_kernel_launch(concat_f32_cont<0>, launch_params,x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2); + ggml_cuda_kernel_launch(concat_cont, launch_params, x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2); return; } if (dim == 1) { - concat_f32_cont<1> - <<>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2); + concat_cont<<>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2); return; } - concat_f32_cont<2><<>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2); + concat_cont<<>>(x, y, dst, ne00, ne01, ne02, ne0, ne1, ne2); } // non-contiguous kernel (slow) -template +template static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) - concat_f32_non_cont( + concat_non_cont( const char * src0, const char * src1, char * dst, @@ -107,61 +109,49 @@ static __global__ void __launch_bounds__(CUDA_CONCAT_BLOCK_SIZE) uint64_t nb0, uint64_t nb1, uint64_t nb2, - uint64_t nb3){ + uint64_t nb3) { static_assert(dim >= 0 && dim <= 3, "dim must be in [0, 3]"); const int64_t i3 = blockIdx.z; const int64_t i2 = blockIdx.y; const int64_t i1 = blockIdx.x; - const float * x; + const T * x; for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) { if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) { - x = (const float *)(src0 + (i3 )*nb03 + (i2 )*nb02 + (i1 )*nb01 + (i0 )*nb00); + x = (const T *)(src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00); } else { if constexpr (dim == 0) { - x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + i1 * nb11 + (i0 - ne00) * nb10); + x = (const T *)(src1 + i3*nb13 + i2*nb12 + i1*nb11 + (i0 - ne00)*nb10); } else if constexpr (dim == 1) { - x = (const float *) (src1 + i3 * nb13 + i2 * nb12 + (i1 - ne01) * nb11 + i0 * nb10); + x = (const T *)(src1 + i3*nb13 + i2*nb12 + (i1 - ne01)*nb11 + i0*nb10); } else if constexpr (dim == 2) { - x = (const float *) (src1 + i3 * nb13 + (i2 - ne02) * nb12 + i1 * nb11 + i0 * nb10); + x = (const T *)(src1 + i3*nb13 + (i2 - ne02)*nb12 + i1*nb11 + i0*nb10); } else if constexpr (dim == 3) { - x = (const float *) (src1 + (i3 - ne03) * nb13 + i2 * nb12 + i1 * nb11 + i0 * nb10); + x = (const T *)(src1 + (i3 - ne03)*nb13 + i2*nb12 + i1*nb11 + i0*nb10); } } - float * y = (float *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + T * y = (T *)(dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); *y = *x; } } - -void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { - const ggml_tensor * src0 = dst->src[0]; - const ggml_tensor * src1 = dst->src[1]; - - cudaStream_t stream = ctx.stream(); - - const int32_t dim = ((int32_t *) dst->op_params)[0]; - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - +template +static void concat_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, int dim, cudaStream_t stream) { if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) { - const float * src0_d = (const float *)src0->data; - const float * src1_d = (const float *)src1->data; - - float * dst_d = (float *)dst->data; + const T * src0_d = (const T *) src0->data; + const T * src1_d = (const T *) src1->data; + T * dst_d = (T *) dst->data; if (dim != 3) { - for (int i3 = 0; i3 < dst->ne[3]; i3++) { - concat_f32_cuda( - src0_d + i3 * (src0->nb[3] / 4), - src1_d + i3 * (src1->nb[3] / 4), - dst_d + i3 * ( dst->nb[3] / 4), + for (int64_t i3 = 0; i3 < dst->ne[3]; i3++) { + concat_cont_cuda( + src0_d + i3*(src0->nb[3] / sizeof(T)), + src1_d + i3*(src1->nb[3] / sizeof(T)), + dst_d + i3*( dst->nb[3] / sizeof(T)), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream); } @@ -169,13 +159,13 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const size_t size0 = ggml_nbytes(src0); const size_t size1 = ggml_nbytes(src1); - CUDA_CHECK(cudaMemcpyAsync(dst_d, src0_d, size0, cudaMemcpyDeviceToDevice, stream)); - CUDA_CHECK(cudaMemcpyAsync(dst_d + size0/4, src1_d, size1, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync((char *) dst->data, src0->data, size0, cudaMemcpyDeviceToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync((char *) dst->data + size0, src1->data, size1, cudaMemcpyDeviceToDevice, stream)); } } else { dim3 grid_dim(dst->ne[1], dst->ne[2], dst->ne[3]); auto launch_kernel = [&](auto dim) { - concat_f32_non_cont<<>>( + concat_non_cont<<>>( (const char *) src0->data, (const char *) src1->data, (char *) dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], @@ -203,3 +193,35 @@ void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { } } } + +void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + cudaStream_t stream = ctx.stream(); + + const int32_t dim = ((int32_t *) dst->op_params)[0]; + + GGML_ASSERT(src0->type == src1->type); + GGML_ASSERT(dst->type == src0->type); + GGML_ASSERT(!ggml_is_quantized(src0->type)); + GGML_ASSERT(ggml_blck_size(src0->type) == 1); + + switch (ggml_type_size(src0->type)) { + case 1: + concat_cuda(src0, src1, dst, dim, stream); + break; + case 2: + concat_cuda(src0, src1, dst, dim, stream); + break; + case 4: + concat_cuda(src0, src1, dst, dim, stream); + break; + case 8: + concat_cuda(src0, src1, dst, dim, stream); + break; + default: + GGML_ABORT("Unsupported type size: %zu", ggml_type_size(src0->type)); + break; + } +} diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index 121472ec2283..eb5eb0eb4ebc 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -53,10 +53,10 @@ static __global__ void cpy_scalar_transpose(const char * cx, char * cdst, const const int64_t nmat = ne / (ne00 * ne01); const int64_t n = ne00 * ne01; - const int x = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x; - const int y = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y; - const int tx = blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset - const int ty = blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y; + const int64_t x = (int64_t) blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.x; + const int64_t y = (int64_t) blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.y; + const int64_t tx = (int64_t) blockIdx.y * CUDA_CPY_TILE_DIM_2D + threadIdx.x; // transpose block offset + const int64_t ty = (int64_t) blockIdx.x * CUDA_CPY_TILE_DIM_2D + threadIdx.y; __shared__ float tile[2][CUDA_CPY_TILE_DIM_2D][CUDA_CPY_TILE_DIM_2D+1]; int cur_tile_buf = 0; @@ -197,7 +197,7 @@ static void ggml_cpy_scalar_contiguous_cuda( cudaStream_t stream) { const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; - GGML_ASSERT(num_blocks < UINT_MAX); + GGML_ASSERT(num_blocks <= INT_MAX); const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream); ggml_cuda_kernel_launch(cpy_scalar_contiguous, launch_params, cx, cdst, ne); } @@ -208,6 +208,14 @@ static void ggml_cpy_scalar_cuda( const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t nb00, const int64_t nb01, const int64_t nb02, const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) { + const auto launch_scalar_generic = [&]() { + const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + GGML_ASSERT(num_blocks <= INT_MAX); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream); + ggml_cuda_kernel_launch(cpy_scalar>, launch_params, + cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + }; + if (transposed) { GGML_ASSERT(ne == ne00*ne01*ne02); // ne[3] is 1 assumed int64_t ne00n, ne01n, ne02n; @@ -224,20 +232,18 @@ static void ggml_cpy_scalar_cuda( int64_t grid_x = (ne01n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D; int64_t grid_y = (ne00n + CUDA_CPY_TILE_DIM_2D - 1) / CUDA_CPY_TILE_DIM_2D; int64_t grid_z = (ne/(ne01n*ne00n) + CUDA_CPY_BLOCK_NM - 1) / CUDA_CPY_BLOCK_NM; - GGML_ASSERT(grid_x < UINT_MAX); - GGML_ASSERT(grid_y < USHRT_MAX); - GGML_ASSERT(grid_z < USHRT_MAX); - dim3 dimGrid(grid_x, grid_y, grid_z); - dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1); - const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(dimGrid, dimBlock, 0, stream); - ggml_cuda_kernel_launch(cpy_scalar_transpose, launch_params, - cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + GGML_ASSERT(grid_x <= INT_MAX); + if (grid_y > USHRT_MAX || grid_z > USHRT_MAX) { + launch_scalar_generic(); + } else { + dim3 dimGrid(grid_x, grid_y, grid_z); + dim3 dimBlock(CUDA_CPY_TILE_DIM_2D, CUDA_CPY_BLOCK_ROWS, 1); + const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params(dimGrid, dimBlock, 0, stream); + ggml_cuda_kernel_launch(cpy_scalar_transpose, launch_params, + cx, cdst, ne, ne00n, ne01n, ne02n, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + } } else { - const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; - GGML_ASSERT(num_blocks < UINT_MAX); - const ggml_cuda_kernel_launch_params launch_params = ggml_cuda_kernel_launch_params((dim3)num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream); - ggml_cuda_kernel_launch(cpy_scalar>, launch_params, - cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); + launch_scalar_generic(); } } @@ -248,7 +254,7 @@ static void ggml_cpy_f32_q8_0_cuda( GGML_ASSERT(ne % QK8_0 == 0); const int64_t num_blocks = ne / QK8_0; - GGML_ASSERT(num_blocks < UINT_MAX); + GGML_ASSERT(num_blocks <= INT_MAX); cpy_f32_q<<>> (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } @@ -259,7 +265,7 @@ static void ggml_cpy_q8_0_f32_cuda( const int64_t nb03, const int64_t ne10, const int64_t ne11, const int64_t ne12, const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) { const int64_t num_blocks = ne; - GGML_ASSERT(num_blocks < UINT_MAX); + GGML_ASSERT(num_blocks <= INT_MAX); cpy_q_f32<<>> (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } @@ -271,7 +277,7 @@ static void ggml_cpy_f32_q4_0_cuda( GGML_ASSERT(ne % QK4_0 == 0); const int64_t num_blocks = ne / QK4_0; - GGML_ASSERT(num_blocks < UINT_MAX); + GGML_ASSERT(num_blocks <= INT_MAX); cpy_f32_q<<>> (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } @@ -284,7 +290,7 @@ static void ggml_cpy_q4_0_f32_cuda( const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) { const int64_t num_blocks = ne; - GGML_ASSERT(num_blocks < UINT_MAX); + GGML_ASSERT(num_blocks <= INT_MAX); cpy_q_f32, QK4_0><<>>( cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); @@ -297,7 +303,7 @@ static void ggml_cpy_f32_q4_1_cuda( GGML_ASSERT(ne % QK4_1 == 0); const int64_t num_blocks = ne / QK4_1; - GGML_ASSERT(num_blocks < UINT_MAX); + GGML_ASSERT(num_blocks <= INT_MAX); cpy_f32_q<<>> (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } @@ -310,7 +316,7 @@ static void ggml_cpy_q4_1_f32_cuda( const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) { const int64_t num_blocks = ne; - GGML_ASSERT(num_blocks < UINT_MAX); + GGML_ASSERT(num_blocks <= INT_MAX); cpy_q_f32, QK4_1><<>>( cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); @@ -323,7 +329,7 @@ static void ggml_cpy_f32_q5_0_cuda( GGML_ASSERT(ne % QK5_0 == 0); const int64_t num_blocks = ne / QK5_0; - GGML_ASSERT(num_blocks < UINT_MAX); + GGML_ASSERT(num_blocks <= INT_MAX); cpy_f32_q<<>> (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } @@ -336,7 +342,7 @@ static void ggml_cpy_q5_0_f32_cuda( const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) { const int64_t num_blocks = ne; - GGML_ASSERT(num_blocks < UINT_MAX); + GGML_ASSERT(num_blocks <= INT_MAX); cpy_q_f32, QK5_0><<>>( cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); @@ -349,7 +355,7 @@ static void ggml_cpy_f32_q5_1_cuda( GGML_ASSERT(ne % QK5_1 == 0); const int64_t num_blocks = ne / QK5_1; - GGML_ASSERT(num_blocks < UINT_MAX); + GGML_ASSERT(num_blocks <= INT_MAX); cpy_f32_q<<>> (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } @@ -362,7 +368,7 @@ static void ggml_cpy_q5_1_f32_cuda( const int64_t nb10, const int64_t nb11, const int64_t nb12, const int64_t nb13, cudaStream_t stream) { const int64_t num_blocks = ne; - GGML_ASSERT(num_blocks < UINT_MAX); + GGML_ASSERT(num_blocks <= INT_MAX); cpy_q_f32, QK5_1><<>>( cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); @@ -375,11 +381,51 @@ static void ggml_cpy_f32_iq4_nl_cuda( GGML_ASSERT(ne % QK4_NL == 0); const int64_t num_blocks = ne / QK4_NL; - GGML_ASSERT(num_blocks < UINT_MAX); + GGML_ASSERT(num_blocks <= INT_MAX); cpy_f32_q<<>> (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); } +// check if a same-type copy reduces to a 2D strided copy (height rows of width +// contiguous bytes), so it can use cudaMemcpy2DAsync instead of the scalar kernel +static bool ggml_cuda_cpy_as_memcpy_2d(const ggml_tensor * src0, const ggml_tensor * src1, + size_t & width, size_t & height, size_t & spitch, size_t & dpitch) { + // require matching shape: a reshaped copy maps elements by flat order, which the + // prefix walk below does not handle + if (src0->type != src1->type || !ggml_are_same_shape(src0, src1)) { + return false; + } + + // grow the contiguous prefix block shared by both tensors + size_t block_nb = ggml_element_size(src0); + int d = 0; + for (; d < GGML_MAX_DIMS; ++d) { + if (src0->nb[d] != block_nb || src1->nb[d] != block_nb) { + break; + } + block_nb *= src0->ne[d]; + } + + // d == 0: nothing contiguous; d == GGML_MAX_DIMS: fully contiguous (handled by memcpy) + if (d == 0 || d == GGML_MAX_DIMS) { + return false; + } + + // dim d carries the rows; everything above it must be a single element + for (int i = d + 1; i < GGML_MAX_DIMS; ++i) { + if (src0->ne[i] != 1) { + return false; + } + } + + width = block_nb; + height = src0->ne[d]; + spitch = src0->nb[d]; + dpitch = src1->nb[d]; + + return spitch >= width && dpitch >= width; +} + void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) { const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); @@ -415,6 +461,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) && src0->ne[3] == 1 && nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0); + size_t mc_width = 0, mc_height = 0, mc_spitch = 0, mc_dpitch = 0; + if (src0->type == src1->type && contiguous_srcs) { GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1)); #if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY) @@ -425,6 +473,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg { CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream)); } + } else if (ggml_cuda_cpy_as_memcpy_2d(src0, src1, mc_width, mc_height, mc_spitch, mc_dpitch)) { + CUDA_CHECK(cudaMemcpy2DAsync(src1_ddc, mc_dpitch, src0_ddc, mc_spitch, + mc_width, mc_height, cudaMemcpyDeviceToDevice, main_stream)); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { if (can_be_transposed) { ggml_cpy_scalar_cuda diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index e779a9be9e95..f3fb32452d2c 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -11,6 +11,7 @@ #include "ggml-cuda/argsort.cuh" #include "ggml-cuda/binbcast.cuh" #include "ggml-cuda/clamp.cuh" +#include "ggml-cuda/col2im-1d.cuh" #include "ggml-cuda/concat.cuh" #include "ggml-cuda/conv-transpose-1d.cuh" #include "ggml-cuda/conv2d.cuh" @@ -622,18 +623,6 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() { // cuda buffer -struct ggml_backend_cuda_device_context { - int device; - std::string name; - std::string description; - std::string pci_bus_id; - int op_offload_min_batch_size; -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - std::mutex device_mutex; - int active_count = 0; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) -}; - struct ggml_backend_cuda_buffer_context { int device; void * dev_ptr = nullptr; @@ -651,13 +640,6 @@ struct ggml_backend_cuda_buffer_context { static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; - -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buffer->buft->device->context; - std::lock_guard lock(dev_ctx->device_mutex); - dev_ctx->active_count--; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - delete ctx; } @@ -810,12 +792,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr); -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buft->device->context; - std::lock_guard lock(dev_ctx->device_mutex); - dev_ctx->active_count++; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); } @@ -1515,12 +1491,6 @@ static bool ggml_backend_buft_is_cuda_host(ggml_backend_buffer_type_t buft) { } static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buffer->buft->device->context; - std::lock_guard lock(dev_ctx->device_mutex); - dev_ctx->active_count--; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - CUDA_CHECK(cudaFreeHost(buffer->context)); } @@ -1529,8 +1499,6 @@ static void * ggml_cuda_host_malloc(size_t size) { return nullptr; } - ggml_cuda_set_device(0); // cudaMallocHost can create the implicit CUDA device context, make sure that this is consistently done on device 0. - void * ptr = nullptr; cudaError_t err = cudaMallocHost((void **) &ptr, size); if (err != cudaSuccess) { @@ -1556,12 +1524,6 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm buffer->buft = buft; buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) buft->device->context; - std::lock_guard lock(dev_ctx->device_mutex); - dev_ctx->active_count++; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - return buffer; } @@ -3090,6 +3052,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_CONV_TRANSPOSE_1D: ggml_cuda_op_conv_transpose_1d(ctx,dst); break; + case GGML_OP_COL2IM_1D: + ggml_cuda_op_col2im_1d(ctx, dst); + break; case GGML_OP_POOL_2D: ggml_cuda_op_pool2d(ctx, dst); break; @@ -3179,12 +3144,6 @@ static const char * ggml_backend_cuda_get_name(ggml_backend_t backend) { static void ggml_backend_cuda_free(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) backend->device->context; - std::lock_guard lock(dev_ctx->device_mutex); - dev_ctx->active_count--; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - delete cuda_ctx; delete backend; } @@ -3233,11 +3192,24 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_ ggml_backend_buffer_t buf_src = src->view_src ? src->view_src->buffer : src->buffer; ggml_backend_buffer_t buf_dst = dst->view_src ? dst->view_src->buffer : dst->buffer; - if (!ggml_backend_is_cuda(backend_src) || !ggml_backend_is_cuda(backend_dst)) { + // Enables async copies from CPU to CUDA, instead of only CUDA-to-CUDA + // Excluding this path for HIP and MUSA as a precaution. + // According to the summary in https://github.com/ggml-org/llama.cpp/pull/20793#issuecomment-4275794315, this change is not beneficial for hip anyways. + // Additionally, there is a lot of anectodal evidence that hip/musa stream behavior might not always 1:1 match CUDA behavior. + // e.g. https://github.com/ROCm/rocm-systems/issues/5109 + // It thus makes sense to exclude this path for HIP and MUSA. This PR was not aimed these backends, the majority of testing happened on CUDA. + // This can be revisited in the future if enabling copy_from_host benefits hip/MUSA, and if the PR author can extensively test on these backends. +#if defined(GGML_USE_HIP) || defined(GGML_USE_MUSA) + const bool copy_from_host = false; +#else + const bool copy_from_host = ggml_backend_buffer_is_host(buf_src) && ggml_backend_dev_type(backend_src->device) == GGML_BACKEND_DEVICE_TYPE_CPU; +#endif + + if (!(copy_from_host || ggml_backend_is_cuda(backend_src)) || !ggml_backend_is_cuda(backend_dst)) { return false; } - if (!ggml_backend_buffer_is_cuda(buf_src) || !ggml_backend_buffer_is_cuda(buf_dst)) { + if (!(copy_from_host || ggml_backend_buffer_is_cuda(buf_src)) || !ggml_backend_buffer_is_cuda(buf_dst)) { return false; } @@ -3248,14 +3220,17 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_ ggml_backend_cuda_buffer_context * buf_ctx_src = (ggml_backend_cuda_buffer_context *) buf_src->context; ggml_backend_cuda_buffer_context * buf_ctx_dst = (ggml_backend_cuda_buffer_context *) buf_dst->context; - if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) { + if ((copy_from_host && cuda_ctx_dst->device != buf_ctx_dst->device) || + !copy_from_host && (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device)) { #ifndef NDEBUG GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__); #endif // NDEBUG return false; } - if (backend_src != backend_dst) { + if (copy_from_host) { + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyHostToDevice, cuda_ctx_dst->stream())); + } else if (backend_src != backend_dst) { // copy on src stream if (cuda_ctx_src->device == cuda_ctx_dst->device) { CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_src->stream())); @@ -4916,6 +4891,14 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) { // backend device +struct ggml_backend_cuda_device_context { + int device; + std::string name; + std::string description; + std::string pci_bus_id; + int op_offload_min_batch_size; +}; + static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; return ctx->name.c_str(); @@ -5004,11 +4987,6 @@ static bool ggml_backend_cuda_get_available_uma_memory(long * available_memory_k static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context; - -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - std::lock_guard lock(ctx->device_mutex); -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaMemGetInfo(free, total)); @@ -5035,13 +5013,6 @@ static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * } #endif // defined(__linux__) -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - // If no backends or buffers are active, the cudaMemGetInfo call above lazily created a CUDA - // context that permanently consumes VRAM. Reset the device to free it. - if (ctx->active_count == 0) { - CUDA_CHECK(cudaDeviceReset()); - } -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) } static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) { @@ -5337,15 +5308,24 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } break; case GGML_OP_REPEAT: { + // the CUDA REPEAT path only implements F32/F16; other types assert at runtime ggml_type src0_type = op->src[0]->type; - return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; + return src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16; } break; case GGML_OP_REPEAT_BACK: return op->type == GGML_TYPE_F32 && (op->src[0]->ne[2]*op->src[0]->ne[3]) <= (1 << 15); case GGML_OP_CONCAT: { ggml_type src0_type = op->src[0]->type; - return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; + ggml_type src1_type = op->src[1]->type; + return src0_type == src1_type && + src0_type == op->type && + !ggml_is_quantized(src0_type) && + ggml_blck_size(src0_type) == 1 && + (ggml_type_size(src0_type) == 1 || + ggml_type_size(src0_type) == 2 || + ggml_type_size(src0_type) == 4 || + ggml_type_size(src0_type) == 8); } break; case GGML_OP_CONV_TRANSPOSE_1D: { @@ -5356,13 +5336,21 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } return false; } break; + case GGML_OP_COL2IM_1D: + { + ggml_type src0_type = op->src[0]->type; + return (src0_type == GGML_TYPE_F32 || src0_type == GGML_TYPE_F16 || src0_type == GGML_TYPE_BF16) && + op->type == src0_type && + ggml_is_contiguous(op->src[0]) && + ggml_is_contiguous(op); + } break; case GGML_OP_SILU_BACK: return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; break; case GGML_OP_NORM: case GGML_OP_RMS_NORM: case GGML_OP_L2_NORM: - return true; + return ggml_is_contiguous_rows(op->src[0]); case GGML_OP_RMS_NORM_BACK: return ggml_is_contiguous(op->src[0]); break; @@ -5736,21 +5724,13 @@ ggml_backend_t ggml_backend_cuda_init(int device) { return nullptr; } - ggml_backend_dev_t dev = ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device); - ggml_backend_t cuda_backend = new ggml_backend { /* .guid = */ ggml_backend_cuda_guid(), /* .iface = */ ggml_backend_cuda_interface, - /* .device = */ dev, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), device), /* .context = */ ctx, }; -#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context; - std::lock_guard lock(dev_ctx->device_mutex); - dev_ctx->active_count++; -#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) - return cuda_backend; } diff --git a/ggml/src/ggml-cuda/out-prod.cu b/ggml/src/ggml-cuda/out-prod.cu index 499903d09b13..46b9f3a67ee5 100644 --- a/ggml/src/ggml-cuda/out-prod.cu +++ b/ggml/src/ggml-cuda/out-prod.cu @@ -2,6 +2,28 @@ #include +static __global__ void k_compute_out_prod_ptrs( + const float * src0_d, const float * src1_d, float * dst_d, + const float ** ptrs_a, const float ** ptrs_b, float ** ptrs_c, + const int64_t ne2, const int64_t ne3, + const int64_t dps2, const int64_t dps3, + const size_t s02, const size_t s03, + const size_t s12, const size_t s13, + const size_t s2, const size_t s3) { + const int64_t i2 = blockIdx.x*blockDim.x + threadIdx.x; + const int64_t i3 = blockIdx.y*blockDim.y + threadIdx.y; + + if (i2 >= ne2 || i3 >= ne3) { + return; + } + + const int64_t idx = i3*ne2 + i2; + + ptrs_a[idx] = src0_d + (i3/dps3)*s03 + (i2/dps2)*s02; + ptrs_b[idx] = src1_d + i3 *s13 + i2 *s12; + ptrs_c[idx] = dst_d + i3 *s3 + i2 *s2; +} + void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src1 = dst->src[1]; @@ -67,18 +89,39 @@ void ggml_cuda_out_prod(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { &beta, dst_d + i3 *s3, ldc, s2, batch_count)); } + } else if (ne2 > 1 || ne3 > 1) { + // dps2 > 1 (src0 broadcast along dim 2 with non-uniform stride) or multiple GEMMs + // along dim 3: compute per-GEMM pointers on the device and use a single batched GEMM. + GGML_ASSERT(ne3 > 0); + GGML_ASSERT(ne2 <= (int64_t) std::numeric_limits::max() / ne3); + const int batch_count = (int) (ne2 * ne3); + + ggml_cuda_pool_alloc ptrs_a(ctx.pool(), batch_count); + ggml_cuda_pool_alloc ptrs_b(ctx.pool(), batch_count); + ggml_cuda_pool_alloc< float *> ptrs_c(ctx.pool(), batch_count); + + const dim3 block_dims(16, 16); + const dim3 grid_dims((ne2 + block_dims.x - 1)/block_dims.x, (ne3 + block_dims.y - 1)/block_dims.y); + k_compute_out_prod_ptrs<<>>( + src0_d, src1_d, dst_d, + ptrs_a.get(), ptrs_b.get(), ptrs_c.get(), + ne2, ne3, dps2, dps3, s02, s03, s12, s13, s2, s3); + CUDA_CHECK(cudaGetLastError()); + + CUBLAS_CHECK( + cublasSgemmBatched(handle, CUBLAS_OP_N, src1_cublas_op, + ne0, ne1, ne01, + &alpha, ptrs_a.get(), lda, + ptrs_b.get(), ldb, + &beta, ptrs_c.get(), ldc, + batch_count)); } else { - // Fallback: ne2 == 1 (no batching benefit) or dps2 > 1 (src0 broadcast along dim 2 - // with non-uniform stride; would need cublasSgemmBatched with pointer arrays). - for (int64_t i3 = 0; i3 < ne3; ++i3) { - for (int64_t i2 = 0; i2 < ne2; ++i2) { - CUBLAS_CHECK( - cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op, - ne0, ne1, ne01, - &alpha, src0_d + (i3/dps3)*s03 + (i2/dps2)*s02, lda, - src1_d + i3 *s13 + i2 *s12, ldb, - &beta, dst_d + i3 *s3 + i2 *s2, ldc)); - } - } + // ne2 == 1 && ne3 == 1: single GEMM + CUBLAS_CHECK( + cublasSgemm(handle, CUBLAS_OP_N, src1_cublas_op, + ne0, ne1, ne01, + &alpha, src0_d, lda, + src1_d, ldb, + &beta, dst_d, ldc)); } } diff --git a/ggml/src/ggml-cuda/vendors/hip.h b/ggml/src/ggml-cuda/vendors/hip.h index a6115cd80dc4..d01f1533abb6 100644 --- a/ggml/src/ggml-cuda/vendors/hip.h +++ b/ggml/src/ggml-cuda/vendors/hip.h @@ -48,6 +48,7 @@ #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS #define cublasSetStream hipblasSetStream #define cublasSgemm hipblasSgemm +#define cublasSgemmBatched hipblasSgemmBatched #define cublasSgemmStridedBatched hipblasSgemmStridedBatched #define cublasStatus_t hipblasStatus_t #define cublasOperation_t hipblasOperation_t diff --git a/ggml/src/ggml-cuda/vendors/musa.h b/ggml/src/ggml-cuda/vendors/musa.h index 99e8fa3703e0..6d725c7ec196 100644 --- a/ggml/src/ggml-cuda/vendors/musa.h +++ b/ggml/src/ggml-cuda/vendors/musa.h @@ -32,6 +32,7 @@ #define cublasSetMathMode mublasSetMathMode #define cublasSetStream mublasSetStream #define cublasSgemm mublasSgemm +#define cublasSgemmBatched mublasSgemmBatched #define cublasSgemmStridedBatched mublasSgemmStridedBatched #define cublasStatus_t mublasStatus_t #define cublasOperation_t mublasOperation_t diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index b82bae0c1035..c6e49a71d11b 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -25,7 +25,6 @@ include(ExternalProject) option(GGML_HEXAGON_HTP_DEBUG "ggml-hexagon: enable HTP debug output" OFF) option(GGML_HEXAGON_FA_EXP2_HF "ggml-hexagon: use FP16 exp2 polynomial in FA softmax instead of F32 exp round-trip" OFF) set(GGML_HEXAGON_HTP_CERT "$ENV{HEXAGON_HTP_CERT}" CACHE PATH "ggml-hexagon: enable HTP library signing using certificate") -set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml-hexagon: quantize group size (32, 64, or 128)") add_library(htp_iface OBJECT ${CMAKE_CURRENT_BINARY_DIR}/htp_iface_stub.c) @@ -72,15 +71,12 @@ function(build_htp_skel V) -DHEXAGON_SDK_ROOT=${HEXAGON_SDK_ROOT} -DHEXAGON_TOOLS_ROOT=${HEXAGON_TOOLS_ROOT} -DHEXAGON_HTP_DEBUG=${GGML_HEXAGON_HTP_DEBUG} - -DGGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE} -DDSP_VERSION=${V} -DPREBUILT_LIB_DIR="toolv19_${V}") list(APPEND HTP_SKELS ${CMAKE_CURRENT_BINARY_DIR}/libggml-htp-${V}.so) set(HTP_SKELS ${HTP_SKELS} PARENT_SCOPE) endfunction() -build_htp_skel(v68) -build_htp_skel(v69) build_htp_skel(v73) build_htp_skel(v75) build_htp_skel(v79) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 49bd7e4331a2..3d41c47b6515 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef _WIN32 # include @@ -41,6 +42,7 @@ #include "ggml-quants.h" #include "htp-opnode.h" #include "htp-ops.h" +#include "htp/matmul-ops.h" #include "htp_iface.h" #include "htp-drv.h" @@ -51,7 +53,7 @@ using u32vec = std::vector; static int opt_arch = 0; // autodetect static size_t opt_ndev = 1; static size_t opt_nhvx = 0; // use all -static int opt_use_hmx = 1; // when set, enable HMX; when 0, use HVX only +static int opt_nhmx = 1; // when set, enable HMX; when 0, use HVX only static size_t opt_vmem = HTP_OP_MAX_VMEM_DEFAULT; // max available va space for buffer mappings static size_t opt_mbuf = 1ul * 1024 * 1024 * 1024; // max buffer size static int opt_etm = 0; @@ -59,6 +61,8 @@ static int opt_verbose = 0; static int opt_profile = 0; // profiling mode (0-disabled, 1-basic, 2-pmu) static int opt_hostbuf = 1; // hostbuf ON by default +static int opt_mm_select = 3; // 3 = HMX -> Tiled -> Flat -> CPU, 2 = Tiled -> Flat -> CPU, 1 = Flat -> CPU + // Default PMU events, if profiling with PMU (mode=2) is enabled // See https://docs.qualcomm.com/doc/80-N2040-60/topic/pmu-events.html // https://docs.qualcomm.com/doc/80-N2040-61/topic/hvx-pmu-events.html @@ -68,21 +72,15 @@ static u32vec opt_pmu_evt { 0x3, 0x111, 0x100, 0x105, 0x240, 0x256, 0x7D, 0x8C } static int opt_opstage = HTP_OPSTAGE_QUEUE | HTP_OPSTAGE_COMPUTE; static int opt_opbatch = 1024; // max number of ops in a batch static int opt_opqueue = 16; // max number of pending batches +static int opt_optrace = 0; // trace buffer size per thread (0 means default) static int opt_oppoll = 0; // polling for batch completions +static int opt_opfusion = 1; // enable/disable op fusion static std::regex* opt_opfilter = NULL; // regex of ops to not claim #define HEX_VERBOSE(...) \ if (opt_verbose) GGML_LOG_DEBUG(__VA_ARGS__) -static inline uint64_t hex_is_aligned(void * addr, uint32_t align) { - return ((size_t) addr & (align - 1)) == 0; -} - -static inline size_t hex_round_up(size_t n, size_t m) { - return m * ((n + m - 1) / m); -} - static const char * status_to_str(uint32_t status) { switch (status) { case HTP_STATUS_OK: @@ -106,34 +104,89 @@ static void ggml_hexagon_dump_op_exec(const std::string &sess_name, const htp_op if (!opt_verbose) return; htp_opformat fmt(node); - GGML_LOG_DEBUG("ggml-hex: %s execute-op %s: %s : %s : %s : %s : %s : flags 0x%x\n", sess_name.c_str(), - node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, req_flags); + GGML_LOG_DEBUG("ggml-hex: %s execute-op %s|%s|%s|%s|%s|%s|%s|flags 0x%x\n", sess_name.c_str(), + node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, fmt.kparams, req_flags); } static void ggml_hexagon_dump_op_supp(const std::string &sess_name, const struct ggml_tensor * op, bool supp) { if (!opt_verbose) return; htp_opformat fmt(htp_opformat(htp_opnode{const_cast(op), {}, HTP_OP_INVALID})); - GGML_LOG_DEBUG("ggml-hex: %s supports-op %s: %s : %s : %s : %s : %s : %s\n", sess_name.c_str(), + GGML_LOG_DEBUG("ggml-hex: %s supports-op %s|%s|%s|%s|%s|%s|%s\n", sess_name.c_str(), ggml_op_desc(op), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.buffs, supp ? "yes" : "no"); } +static const char * htp_event_name(uint16_t id) { + switch (id) { + case HTP_TRACE_EVT_DMA: return "DMA"; + case HTP_TRACE_EVT_HVX_COMP: return "HVX_COMP"; + case HTP_TRACE_EVT_HVX_A_QUANT: return "HVX_A_QUANT"; + case HTP_TRACE_EVT_HVX_A_PREP: return "HVX_A_PREP"; + case HTP_TRACE_EVT_HVX_W_DEQUANT: return "HVX_W_DEQUANT"; + case HTP_TRACE_EVT_HVX_W_PREP: return "HVX_W_PREP"; + case HTP_TRACE_EVT_HVX_O_PROC: return "HVX_O_PROC"; + case HTP_TRACE_EVT_HMX_COMP: return "HMX_COMP"; + default: return "UNKNOWN"; + } +} + static void ggml_hexagon_dump_op_prof(const std::string &sess_name, const htp_opnode & node, - uint32_t op_usec, uint32_t op_cycles, const uint32_t pmu[]) { + const htp_prof_desc & pd) { if (!opt_profile) return; + uint32_t op_usec = pd.usecs; + uint32_t op_cycles = pd.cycles_stop - pd.cycles_start; + const uint32_t * pmu = pd.pmu; + char pmu_str[256] = ""; - if (opt_profile > 1) { + if (opt_profile == 2) { static_assert(HTP_PROF_PMU_NCNT == 8, "current implementation assumes 8 PMU counters"); - sprintf(pmu_str, " pmu [%u,%u,%u,%u,%u,%u,%u,%u]", + snprintf(pmu_str, sizeof(pmu_str), " pmu [%u,%u,%u,%u,%u,%u,%u,%u]", pmu[0], pmu[1], pmu[2], pmu[3], pmu[4], pmu[5], pmu[6], pmu[7]); } htp_opformat fmt(node); - GGML_LOG_DEBUG("ggml-hex: %s profile-op %s: %s : %s : %s : %s : usec %u cycles %u%s\n", sess_name.c_str(), - node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, op_usec, op_cycles, pmu_str); + float mhz = op_usec > 0 ? (float) op_cycles / op_usec : 0.0f; + GGML_LOG_DEBUG("ggml-hex: %s profile-op %s|%s|%s|%s|%s|%s|usec %u cycles %u start %u mhz %.1f%s\n", sess_name.c_str(), + node.op_name().c_str(), fmt.names, fmt.dims, fmt.types, fmt.strides, fmt.kparams, op_usec, op_cycles, pd.cycles_start, mhz, pmu_str); +} + +// ** + +static inline bool ggml_hexagon_is_repack_type(enum ggml_type type) { + return type == GGML_TYPE_Q4_0 || type == GGML_TYPE_Q4_1 || + type == GGML_TYPE_Q8_0 || type == GGML_TYPE_IQ4_NL || + type == GGML_TYPE_MXFP4; +} + +static inline bool ggml_hexagon_is_hmx_weight_type(enum ggml_type type) { + return type == GGML_TYPE_F16 || type == GGML_TYPE_F32 || ggml_hexagon_is_repack_type(type); } +struct htp_mm_kernel_params; +struct ggml_hexagon_session; +static void ggml_hexagon_precompute_matmul_params( + const struct ggml_hexagon_session * sess, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * dst, + struct htp_mm_kernel_params * kparams +); + +static void ggml_hexagon_precompute_fused_qkv_params( + const struct ggml_hexagon_session * sess, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct htp_mm_kernel_params * kparams +); + +static void ggml_hexagon_precompute_fused_ffn_params( + const struct ggml_hexagon_session * sess, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct htp_mm_kernel_params * kparams +); + // ** backend sessions struct ggml_hexagon_opbatch; @@ -160,6 +213,18 @@ struct ggml_hexagon_session { ggml_backend_buffer_type buffer_type = {}; ggml_backend_buffer_type repack_buffer_type = {}; + uint32_t n_threads = 0; + uint32_t n_hvx = 0; + uint32_t n_hmx = 0; + uint64_t vtcm_size = 0; + size_t max_vmem = 0; + size_t max_bufsize = 0; + + struct { + uint64_t uid = 0; + std::vector htp_nodes; + } cached_graph; + ggml_hexagon_session(int dev_id, ggml_backend_dev_t dev) noexcept(false); ~ggml_hexagon_session() noexcept(true); @@ -305,47 +370,7 @@ static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buf return GGML_STATUS_SUCCESS; } -// ======== Q4x4x2 ==================== -struct x2_q4 { - int v[2]; -}; - -static x2_q4 unpack_q4(uint8_t v) { - x2_q4 x = { (int) (v & 0x0f) - 8, (int) (v >> 4) - 8 }; - return x; -} - -static void dump_block_q4_0(const block_q4_0 * b, int i) { - HEX_VERBOSE("ggml-hex: repack q4_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_q4(b->qs[0]).v[0], - unpack_q4(b->qs[1]).v[0], unpack_q4(b->qs[2]).v[0], unpack_q4(b->qs[3]).v[0], unpack_q4(b->qs[12]).v[1], - unpack_q4(b->qs[13]).v[1], unpack_q4(b->qs[14]).v[1], unpack_q4(b->qs[15]).v[1], - GGML_FP16_TO_FP32(b->d)); -} - -static void dump_packed_block_q4x4x2(const uint8_t * v, unsigned int i, size_t k) { - static const int qk = QK_Q4_0x4x2; - const int dblk_size = 8 * 2; // 8x __fp16 - const int qblk_size = qk / 2; // int4 - const int qrow_size = k / 2; // int4 (not padded) - - const uint8_t * v_q = v + 0; // quants first - const uint8_t * v_d = v + qrow_size; // then scales - - const uint8_t * q = v_q + i * qblk_size; - const ggml_half * d = (const ggml_half *) (v_d + i * dblk_size); - - HEX_VERBOSE("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, - unpack_q4(q[0]).v[0], unpack_q4(q[1]).v[0], unpack_q4(q[2]).v[0], unpack_q4(q[3]).v[0], - unpack_q4(q[60]).v[0], unpack_q4(q[61]).v[0], unpack_q4(q[62]).v[0], unpack_q4(q[63]).v[0], - unpack_q4(q[124]).v[0], unpack_q4(q[125]).v[0], unpack_q4(q[126]).v[0], unpack_q4(q[127]).v[0], - GGML_FP16_TO_FP32(d[0]), GGML_FP16_TO_FP32(d[1]), GGML_FP16_TO_FP32(d[2]), GGML_FP16_TO_FP32(d[3])); - - HEX_VERBOSE("ggml-hex: repack q4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", - i + 1, unpack_q4(q[0]).v[1], unpack_q4(q[1]).v[1], unpack_q4(q[2]).v[1], unpack_q4(q[3]).v[1], - unpack_q4(q[60]).v[1], unpack_q4(q[61]).v[1], unpack_q4(q[62]).v[1], unpack_q4(q[63]).v[1], - unpack_q4(q[124]).v[1], unpack_q4(q[125]).v[1], unpack_q4(q[126]).v[1], unpack_q4(q[127]).v[1], - GGML_FP16_TO_FP32(d[4]), GGML_FP16_TO_FP32(d[5]), GGML_FP16_TO_FP32(d[6]), GGML_FP16_TO_FP32(d[7])); -} +// ** Repack helpers for tiled quantized weights static void unpack_q4_0_quants(uint8_t * qs, const block_q4_0 * x, unsigned int bi) { static const int qk = QK4_0; @@ -368,300 +393,6 @@ static void pack_q4_0_quants(block_q4_0 * x, const uint8_t * qs, unsigned int bi } } -static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) { - static const int qk = QK_Q4_0x4x2; - const int nb = (k + qk - 1) / qk; // number of blocks (padded) - const int nloe = k % qk; // leftovers - - const int dblk_size = 8 * 2; // 8x __fp16 - const int qblk_size = qk / 2; // int4 - const int qrow_size = k / 2; // int4 (not padded to blocks) - - uint8_t * y_q = y + 0; // quants first - uint8_t * y_d = y + qrow_size; // then scales - - if (opt_verbose > 2) { - for (int i = 0; i < nb; i++) { - dump_block_q4_0(&x[i * 8 + 0], 0); - dump_block_q4_0(&x[i * 8 + 1], 1); - dump_block_q4_0(&x[i * 8 + 2], 2); - dump_block_q4_0(&x[i * 8 + 3], 3); - dump_block_q4_0(&x[i * 8 + 4], 4); - dump_block_q4_0(&x[i * 8 + 5], 5); - dump_block_q4_0(&x[i * 8 + 6], 6); - dump_block_q4_0(&x[i * 8 + 7], 7); - } - } - - // Repack the quants - for (int i = 0; i < nb; i++) { - uint8_t qs[QK_Q4_0x4x2]; // unpacked quants - unpack_q4_0_quants(qs, &x[i * 8 + 0], 0); - unpack_q4_0_quants(qs, &x[i * 8 + 1], 1); - unpack_q4_0_quants(qs, &x[i * 8 + 2], 2); - unpack_q4_0_quants(qs, &x[i * 8 + 3], 3); - unpack_q4_0_quants(qs, &x[i * 8 + 4], 4); - unpack_q4_0_quants(qs, &x[i * 8 + 5], 5); - unpack_q4_0_quants(qs, &x[i * 8 + 6], 6); - unpack_q4_0_quants(qs, &x[i * 8 + 7], 7); - - bool partial = (nloe && i == nb-1); - - uint8_t * q = y_q + (i * qblk_size); - for (int j = 0; j < qk / 2; j++) { - q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000]; - } - } - - // Repack the scales - // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2) - // the last block is truncated and overridden by the scales. - for (int i = 0; i < nb; i++) { - // Repack the scales - ggml_half * d = (ggml_half *) (y_d + i * dblk_size); - d[0] = x[i * 8 + 0].d; - d[1] = x[i * 8 + 1].d; - d[2] = x[i * 8 + 2].d; - d[3] = x[i * 8 + 3].d; - d[4] = x[i * 8 + 4].d; - d[5] = x[i * 8 + 5].d; - d[6] = x[i * 8 + 6].d; - d[7] = x[i * 8 + 7].d; - } - - if (opt_verbose > 2) { - for (int i = 0; i < nb; i++) { - dump_packed_block_q4x4x2(y, i, k); - } - } -} - -static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) { - static const int qk = QK_Q4_0x4x2; - const int nb = (k + qk - 1) / qk; // number of blocks (padded) - const int nloe = k % qk; // leftovers - - const int dblk_size = 8 * 2; // 8x __fp16 - const int qblk_size = qk / 2; // int4 - const int qrow_size = k / 2; // int4 (not padded to blocks) - - const uint8_t * y_q = y + 0; // quants first - const uint8_t * y_d = y + qrow_size; // then scales - - if (opt_verbose > 2) { - for (int i = 0; i < nb; i++) { - dump_packed_block_q4x4x2(y, i, k); - } - } - - // Unpack the quants - for (int i = 0; i < nb; i++) { - uint8_t qs[QK_Q4_0x4x2]; // unpacked quants - - bool partial = (nloe && i == nb-1); - - const uint8_t * q = y_q + (i * qblk_size); - for (int j = 0; j < qk / 2; j++) { - if (partial) { - qs[j*2+0] = q[j] & 0xf; - qs[j*2+1] = q[j] >> 4; - } else { - qs[j+000] = q[j] & 0xf; - qs[j+128] = q[j] >> 4; - } - } - - pack_q4_0_quants(&x[i * 8 + 0], qs, 0); - pack_q4_0_quants(&x[i * 8 + 1], qs, 1); - pack_q4_0_quants(&x[i * 8 + 2], qs, 2); - pack_q4_0_quants(&x[i * 8 + 3], qs, 3); - pack_q4_0_quants(&x[i * 8 + 4], qs, 4); - pack_q4_0_quants(&x[i * 8 + 5], qs, 5); - pack_q4_0_quants(&x[i * 8 + 6], qs, 6); - pack_q4_0_quants(&x[i * 8 + 7], qs, 7); - } - - // Repack the scales - // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2) - // the last block is truncated and overridden by the scales. - for (int i = 0; i < nb; i++) { - // Unpack the scales - const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size); - x[i * 8 + 0].d = d[0]; - x[i * 8 + 1].d = d[1]; - x[i * 8 + 2].d = d[2]; - x[i * 8 + 3].d = d[3]; - x[i * 8 + 4].d = d[4]; - x[i * 8 + 5].d = d[5]; - x[i * 8 + 6].d = d[6]; - x[i * 8 + 7].d = d[7]; - } - - if (opt_verbose > 2) { - for (int i = 0; i < nb; i++) { - dump_block_q4_0(&x[i * 8 + 0], 0); - dump_block_q4_0(&x[i * 8 + 1], 1); - dump_block_q4_0(&x[i * 8 + 2], 2); - dump_block_q4_0(&x[i * 8 + 3], 3); - dump_block_q4_0(&x[i * 8 + 4], 4); - dump_block_q4_0(&x[i * 8 + 5], 5); - dump_block_q4_0(&x[i * 8 + 6], 6); - dump_block_q4_0(&x[i * 8 + 7], 7); - } - } -} - -static void init_row_q4x4x2(block_q4_0 * x, int64_t k) { - static const int qk = QK_Q4_0x4x2; - const int nb = (k + qk - 1) / qk; // number of blocks (padded) - - // Init the quants such that they unpack into zeros - uint8_t qs[QK_Q4_0x4x2]; // unpacked quants - memset(qs, 8, sizeof(qs)); - - for (int i = 0; i < nb; i++) { - pack_q4_0_quants(&x[i * 8 + 0], qs, 0); - pack_q4_0_quants(&x[i * 8 + 1], qs, 1); - pack_q4_0_quants(&x[i * 8 + 2], qs, 2); - pack_q4_0_quants(&x[i * 8 + 3], qs, 3); - pack_q4_0_quants(&x[i * 8 + 4], qs, 4); - pack_q4_0_quants(&x[i * 8 + 5], qs, 5); - pack_q4_0_quants(&x[i * 8 + 6], qs, 6); - pack_q4_0_quants(&x[i * 8 + 7], qs, 7); - } - - // Init the scales - // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2) - // the last block is truncated and overridden by the scales. - for (int i = 0; i < nb; i++) { - // Unpack the scales - x[i * 8 + 0].d = 0; - x[i * 8 + 1].d = 0; - x[i * 8 + 2].d = 0; - x[i * 8 + 3].d = 0; - x[i * 8 + 4].d = 0; - x[i * 8 + 5].d = 0; - x[i * 8 + 6].d = 0; - x[i * 8 + 7].d = 0; - } -} - -// repack q4_0 data into q4x4x2 tensor -static void repack_q4_0_q4x4x2(ggml_tensor * t, const void * data, size_t size) { - int64_t nrows = ggml_nrows(t); - - size_t row_size = ggml_row_size(t->type, t->ne[0]); - size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad - size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales) - - // Ensure we don't try to read more data than is available in the source buffer 'data' - // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; - - // Calculate how many full rows and how many remaining bytes we need to process. - const int64_t n_full_rows = n_bytes_to_copy / row_size; - const size_t n_rem_bytes = n_bytes_to_copy % row_size; - - void * buf_pd = ggml_aligned_malloc(row_size_pd); - GGML_ASSERT(buf_pd != NULL); - - void * buf_rp = ggml_aligned_malloc(row_size_rp); - GGML_ASSERT(buf_rp != NULL); - - HEX_VERBOSE("ggml-hex: repack-q4_0-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, - t->ne[0], nrows, row_size); - - init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros - - // 1. Process all the full rows - for (int64_t i = 0; i < n_full_rows; i++) { - const uint8_t * src = (const uint8_t *) data + (i * row_size); - uint8_t * dst = (uint8_t *) t->data + (i * row_size); - - memcpy(buf_pd, src, row_size); - repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]); - memcpy(dst, buf_rp, row_size); - } - - // 2. Process the final, potentially partial, row - if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; - const uint8_t * src = (const uint8_t *) data + (i * row_size); - uint8_t * dst = (uint8_t *) t->data + (i * row_size); - - // re-init the row because we are potentially copying a partial row - init_row_q4x4x2((block_q4_0 *) buf_pd, t->ne[0]); - - // Copy only the remaining bytes from the source. - memcpy(buf_pd, src, n_rem_bytes); - - // Repack the entire buffer - repack_row_q4x4x2((uint8_t *) buf_rp, (const block_q4_0 *) buf_pd, t->ne[0]); - - // Write only the corresponding remaining bytes to the destination tensor. - memcpy(dst, buf_rp, n_rem_bytes); - } - - ggml_aligned_free(buf_pd, row_size_pd); - ggml_aligned_free(buf_rp, row_size_rp); -} - -// repack q4x4x2 tensor into q4_0 data -static void repack_q4x4x2_q4_0(void * data, const ggml_tensor * t, size_t size) { - int64_t nrows = ggml_nrows(t); - - size_t row_size = ggml_row_size(t->type, t->ne[0]); - size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); // extra elements for the pad - size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales) - - // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; - - // Calculate how many full rows and how many remaining bytes we need to process. - const int64_t n_full_rows = n_bytes_to_copy / row_size; - const size_t n_rem_bytes = n_bytes_to_copy % row_size; - - void * buf_pd = ggml_aligned_malloc(row_size_pd); - GGML_ASSERT(buf_pd != NULL); - - void * buf_rp = ggml_aligned_malloc(row_size_rp); - GGML_ASSERT(buf_rp != NULL); - - HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, - t->ne[0], nrows, row_size); - - memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros - - // 1. Process all the full rows - for (int64_t i = 0; i < n_full_rows; i++) { - const uint8_t * src = (const uint8_t *) t->data + (i * row_size); - uint8_t * dst = (uint8_t *) data + (i * row_size); - - memcpy(buf_pd, src, row_size); - unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]); - memcpy(dst, buf_rp, row_size); - } - - // 2. Process the final, potentially partial, row - if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; - const uint8_t * src = (const uint8_t *) t->data + (i * row_size); - uint8_t * dst = (uint8_t *) data + (i * row_size); - - // We still need to read and unpack the entire source row because quantization is block-based. - memcpy(buf_pd, src, row_size); - unpack_row_q4x4x2((block_q4_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]); - - // But we only copy the remaining number of bytes to the destination. - memcpy(dst, buf_rp, n_rem_bytes); - } - - ggml_aligned_free(buf_pd, row_size_pd); - ggml_aligned_free(buf_rp, row_size_rp); -} - static void unpack_q4_1_quants(uint8_t * qs, const block_q4_1 * x, unsigned int bi) { static const int qk = QK4_1; @@ -683,603 +414,19 @@ static void pack_q4_1_quants(block_q4_1 * x, const uint8_t * qs, unsigned int bi } } -static void repack_row_q4_1x4x2(uint8_t * y, const block_q4_1 * x, int64_t k) { - static const int qk = QK_Q4_0x4x2; - const int nb = (k + qk - 1) / qk; // number of blocks (padded) - const int nloe = k % qk; // leftovers - - const int dblk_size = 8 * 4; // 8x (d, m) __fp16 = 32 bytes - const int qblk_size = qk / 2; // int4 = 128 bytes - const int qrow_size = k / 2; // int4 (not padded to blocks) - - uint8_t * y_q = y + 0; // quants first - uint8_t * y_d = y + qrow_size; // then scales/offsets - - // Repack the quants - for (int i = 0; i < nb; i++) { - uint8_t qs[QK_Q4_0x4x2]; // unpacked quants - unpack_q4_1_quants(qs, &x[i * 8 + 0], 0); - unpack_q4_1_quants(qs, &x[i * 8 + 1], 1); - unpack_q4_1_quants(qs, &x[i * 8 + 2], 2); - unpack_q4_1_quants(qs, &x[i * 8 + 3], 3); - unpack_q4_1_quants(qs, &x[i * 8 + 4], 4); - unpack_q4_1_quants(qs, &x[i * 8 + 5], 5); - unpack_q4_1_quants(qs, &x[i * 8 + 6], 6); - unpack_q4_1_quants(qs, &x[i * 8 + 7], 7); - - bool partial = (nloe && i == nb-1); - - uint8_t * q = y_q + (i * qblk_size); - for (int j = 0; j < qk / 2; j++) { - q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000]; - } - } - - // Repack the scales and offsets - for (int i = 0; i < nb; i++) { - ggml_half * d_m = (ggml_half *) (y_d + i * dblk_size); - for (int j = 0; j < 8; j++) { - d_m[j * 2 + 0] = x[i * 8 + j].d; - d_m[j * 2 + 1] = x[i * 8 + j].m; - } - } -} - -static void unpack_row_q4_1x4x2(block_q4_1 * x, const uint8_t * y, int64_t k) { - static const int qk = QK_Q4_0x4x2; - const int nb = (k + qk - 1) / qk; // number of blocks (padded) - const int nloe = k % qk; // leftovers - - const int dblk_size = 8 * 4; // 8x (d, m) __fp16 = 32 bytes - const int qblk_size = qk / 2; // int4 = 128 bytes - const int qrow_size = k / 2; // int4 (not padded to blocks) - - const uint8_t * y_q = y + 0; // quants first - const uint8_t * y_d = y + qrow_size; // then scales/offsets - - // Unpack the quants - for (int i = 0; i < nb; i++) { - uint8_t qs[QK_Q4_0x4x2]; - bool partial = (nloe && i == nb-1); - - const uint8_t * q = y_q + (i * qblk_size); - for (int j = 0; j < qk / 2; j++) { - if (partial) { - qs[j*2+0] = q[j] & 0x0F; - qs[j*2+1] = q[j] >> 4; - } else { - qs[j+000] = q[j] & 0x0F; - qs[j+128] = q[j] >> 4; - } - } - - pack_q4_1_quants(&x[i * 8 + 0], qs, 0); - pack_q4_1_quants(&x[i * 8 + 1], qs, 1); - pack_q4_1_quants(&x[i * 8 + 2], qs, 2); - pack_q4_1_quants(&x[i * 8 + 3], qs, 3); - pack_q4_1_quants(&x[i * 8 + 4], qs, 4); - pack_q4_1_quants(&x[i * 8 + 5], qs, 5); - pack_q4_1_quants(&x[i * 8 + 6], qs, 6); - pack_q4_1_quants(&x[i * 8 + 7], qs, 7); - } - - // Unpack the scales and offsets - for (int i = 0; i < nb; i++) { - const ggml_half * d_m = (const ggml_half *) (y_d + i * dblk_size); - for (int j = 0; j < 8; j++) { - x[i * 8 + j].d = d_m[j * 2 + 0]; - x[i * 8 + j].m = d_m[j * 2 + 1]; - } - } -} - -static void init_row_q4_1x4x2(block_q4_1 * x, int64_t k) { - static const int qk = QK_Q4_0x4x2; - const int nb = (k + qk - 1) / qk; // number of blocks (padded) - - uint8_t qs[QK_Q4_0x4x2]; // unpacked quants - memset(qs, 0, sizeof(qs)); - - for (int i = 0; i < nb; i++) { - pack_q4_1_quants(&x[i * 8 + 0], qs, 0); - pack_q4_1_quants(&x[i * 8 + 1], qs, 1); - pack_q4_1_quants(&x[i * 8 + 2], qs, 2); - pack_q4_1_quants(&x[i * 8 + 3], qs, 3); - pack_q4_1_quants(&x[i * 8 + 4], qs, 4); - pack_q4_1_quants(&x[i * 8 + 5], qs, 5); - pack_q4_1_quants(&x[i * 8 + 6], qs, 6); - pack_q4_1_quants(&x[i * 8 + 7], qs, 7); - } - - for (int i = 0; i < nb; i++) { - for (int j = 0; j < 8; j++) { - x[i * 8 + j].d = 0; - x[i * 8 + j].m = 0; - } - } -} - -static void repack_q4_1_q4x4x2(ggml_tensor * t, const void * data, size_t size) { - int64_t nrows = ggml_nrows(t); - - size_t row_size = ggml_row_size(t->type, t->ne[0]); - size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); - size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales) - - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; - - const int64_t n_full_rows = n_bytes_to_copy / row_size; - const size_t n_rem_bytes = n_bytes_to_copy % row_size; - - void * buf_pd = ggml_aligned_malloc(row_size_pd); - GGML_ASSERT(buf_pd != NULL); - - void * buf_rp = ggml_aligned_malloc(row_size_rp); - GGML_ASSERT(buf_rp != NULL); - - HEX_VERBOSE("ggml-hex: repack-q4_1-q4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, - t->ne[0], nrows, row_size); - - init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]); - - for (int64_t i = 0; i < n_full_rows; i++) { - const uint8_t * src = (const uint8_t *) data + (i * row_size); - uint8_t * dst = (uint8_t *) t->data + (i * row_size); - - memcpy(buf_pd, src, row_size); - repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]); - memcpy(dst, buf_rp, row_size); - } - - if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; - const uint8_t * src = (const uint8_t *) data + (i * row_size); - uint8_t * dst = (uint8_t *) t->data + (i * row_size); - - init_row_q4_1x4x2((block_q4_1 *) buf_pd, t->ne[0]); - memcpy(buf_pd, src, n_rem_bytes); - repack_row_q4_1x4x2((uint8_t *) buf_rp, (const block_q4_1 *) buf_pd, t->ne[0]); - memcpy(dst, buf_rp, n_rem_bytes); - } - - ggml_aligned_free(buf_pd, row_size_pd); - ggml_aligned_free(buf_rp, row_size_rp); -} - -static void repack_q4x4x2_q4_1(void * data, const ggml_tensor * t, size_t size) { - int64_t nrows = ggml_nrows(t); - - size_t row_size = ggml_row_size(t->type, t->ne[0]); - size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q4_0x4x2)); - size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales) - - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; - - const int64_t n_full_rows = n_bytes_to_copy / row_size; - const size_t n_rem_bytes = n_bytes_to_copy % row_size; - - void * buf_pd = ggml_aligned_malloc(row_size_pd); - GGML_ASSERT(buf_pd != NULL); - - void * buf_rp = ggml_aligned_malloc(row_size_rp); - GGML_ASSERT(buf_rp != NULL); - - HEX_VERBOSE("ggml-hex: repack-q4x4x2-q4_1 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, - t->ne[0], nrows, row_size); - - memset(buf_rp, 0, row_size_rp); // clear-out padded buffer to make sure the tail is all zeros - - for (int64_t i = 0; i < n_full_rows; i++) { - const uint8_t * src = (const uint8_t *) t->data + (i * row_size); - uint8_t * dst = (uint8_t *) data + (i * row_size); - - memcpy(buf_rp, src, row_size); - unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]); - memcpy(dst, buf_pd, row_size); - } - - if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; - const uint8_t * src = (const uint8_t *) t->data + (i * row_size); - uint8_t * dst = (uint8_t *) data + (i * row_size); - - // We still need to read and unpack the entire source row because quantization is block-based. - memcpy(buf_rp, src, row_size); - unpack_row_q4_1x4x2((block_q4_1 *) buf_pd, (const uint8_t *) buf_rp, t->ne[0]); - memcpy(dst, buf_pd, n_rem_bytes); - } - - ggml_aligned_free(buf_pd, row_size_pd); - ggml_aligned_free(buf_rp, row_size_rp); -} - -// ======== Q8x4x2 ==================== -static void dump_block_q8_0(const block_q8_0 * b, int i) { - HEX_VERBOSE("ggml-hex: repack q8_0 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, b->qs[0], b->qs[1], b->qs[2], - b->qs[3], b->qs[28], b->qs[29], b->qs[30], b->qs[31], GGML_FP16_TO_FP32(b->d)); -} - -static void dump_packed_block_q8x4x2(const uint8_t * v, unsigned int i, size_t k) { - static const int qk = QK_Q8_0x4x2; - const int dblk_size = 8 * 2; // 8x __fp16 - const int qblk_size = qk; // int8 - const int qrow_size = k; // int8 (not padded) - - const uint8_t * v_q = v + 0; // quants first - const uint8_t * v_d = v + qrow_size; // then scales - - const uint8_t * q = v_q + i * qblk_size; - const ggml_half * d = (const ggml_half *) (v_d + i * dblk_size); - - HEX_VERBOSE("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, - q[0], q[1], q[2], q[3], q[60], q[61], q[62], q[63], q[124], q[125], q[126], q[127], - GGML_FP16_TO_FP32(d[0]), GGML_FP16_TO_FP32(d[1]), GGML_FP16_TO_FP32(d[2]), GGML_FP16_TO_FP32(d[3])); - - HEX_VERBOSE("ggml-hex: repack q8x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", - i + 1, q[128], q[129], q[130], q[131], q[192], q[193], q[194], q[195], q[252], q[253], q[254], q[255], - GGML_FP16_TO_FP32(d[4]), GGML_FP16_TO_FP32(d[5]), GGML_FP16_TO_FP32(d[6]), GGML_FP16_TO_FP32(d[7])); -} - -static void unpack_q8_0_quants(uint8_t * qs, const block_q8_0 * x, unsigned int bi) { - static const int qk = QK8_0; - - for (unsigned int i = 0; i < qk; ++i) { - qs[bi * qk + i] = x->qs[i]; - } -} - -static void pack_q8_0_quants(block_q8_0 * x, const uint8_t * qs, unsigned int bi) { - static const int qk = QK8_0; - - for (unsigned int i = 0; i < qk; ++i) { - x->qs[i] = qs[bi * qk + i]; - } -} - -static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) { - static const int qk = QK_Q8_0x4x2; - const int nb = (k + qk - 1) / qk; // number of blocks (padded) - - const int dblk_size = 8 * 2; // 8x __fp16 - const int qblk_size = qk; // int8 - const int qrow_size = k; // int8 (not padded to blocks) - - uint8_t * y_q = y + 0; // quants first - uint8_t * y_d = y + qrow_size; // then scales - - if (opt_verbose > 2) { - for (int i = 0; i < nb; i++) { - dump_block_q8_0(&x[i * 8 + 0], 0); - dump_block_q8_0(&x[i * 8 + 1], 1); - dump_block_q8_0(&x[i * 8 + 2], 2); - dump_block_q8_0(&x[i * 8 + 3], 3); - dump_block_q8_0(&x[i * 8 + 4], 4); - dump_block_q8_0(&x[i * 8 + 5], 5); - dump_block_q8_0(&x[i * 8 + 6], 6); - dump_block_q8_0(&x[i * 8 + 7], 7); - } - } - - // Repack the quants - for (int i = 0; i < nb; i++) { - uint8_t qs[QK_Q8_0x4x2]; // unpacked quants - - unpack_q8_0_quants(qs, &x[i * 8 + 0], 0); - unpack_q8_0_quants(qs, &x[i * 8 + 1], 1); - unpack_q8_0_quants(qs, &x[i * 8 + 2], 2); - unpack_q8_0_quants(qs, &x[i * 8 + 3], 3); - unpack_q8_0_quants(qs, &x[i * 8 + 4], 4); - unpack_q8_0_quants(qs, &x[i * 8 + 5], 5); - unpack_q8_0_quants(qs, &x[i * 8 + 6], 6); - unpack_q8_0_quants(qs, &x[i * 8 + 7], 7); - - uint8_t * q = y_q + (i * qblk_size); - for (int j = 0; j < qk; j++) { - q[j] = qs[j]; - } - } - - // Repack the scales - // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2) - // the last block is truncated and overridden by the scales. - for (int i = 0; i < nb; i++) { - // Repack the scales - ggml_half * d = (ggml_half *) (y_d + i * dblk_size); - d[0] = x[i * 8 + 0].d; - d[1] = x[i * 8 + 1].d; - d[2] = x[i * 8 + 2].d; - d[3] = x[i * 8 + 3].d; - d[4] = x[i * 8 + 4].d; - d[5] = x[i * 8 + 5].d; - d[6] = x[i * 8 + 6].d; - d[7] = x[i * 8 + 7].d; - } - - if (opt_verbose > 2) { - for (int i = 0; i < nb; i++) { - dump_packed_block_q8x4x2(y, i, k); - } - } -} - -static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) { - static const int qk = QK_Q8_0x4x2; - const int nb = (k + qk - 1) / qk; // number of blocks (padded) - - const int dblk_size = 8 * 2; // 8x __fp16 - const int qblk_size = qk; // int8 - const int qrow_size = k; // int8 (not padded to blocks) - - const uint8_t * y_q = y + 0; // quants first - const uint8_t * y_d = y + qrow_size; // then scales - - if (opt_verbose > 2) { - for (int i = 0; i < nb; i++) { - dump_packed_block_q8x4x2(y, i, k); - } - } - - // Unpack the quants - for (int i = 0; i < nb; i++) { - uint8_t qs[QK_Q4_0x4x2]; // unpacked quants - - const uint8_t * q = y_q + (i * qblk_size); - for (int j = 0; j < qk; j++) { - qs[j] = q[j]; - } - - pack_q8_0_quants(&x[i * 8 + 0], qs, 0); - pack_q8_0_quants(&x[i * 8 + 1], qs, 1); - pack_q8_0_quants(&x[i * 8 + 2], qs, 2); - pack_q8_0_quants(&x[i * 8 + 3], qs, 3); - pack_q8_0_quants(&x[i * 8 + 4], qs, 4); - pack_q8_0_quants(&x[i * 8 + 5], qs, 5); - pack_q8_0_quants(&x[i * 8 + 6], qs, 6); - pack_q8_0_quants(&x[i * 8 + 7], qs, 7); - } - - // Repack the scales - // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2) - // the last block is truncated and overridden by the scales. - for (int i = 0; i < nb; i++) { - // Unpack the scales - const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size); - x[i * 8 + 0].d = d[0]; - x[i * 8 + 1].d = d[1]; - x[i * 8 + 2].d = d[2]; - x[i * 8 + 3].d = d[3]; - x[i * 8 + 4].d = d[4]; - x[i * 8 + 5].d = d[5]; - x[i * 8 + 6].d = d[6]; - x[i * 8 + 7].d = d[7]; - } - - if (opt_verbose > 2) { - for (int i = 0; i < nb; i++) { - dump_block_q8_0(&x[i * 8 + 0], 0); - dump_block_q8_0(&x[i * 8 + 1], 1); - dump_block_q8_0(&x[i * 8 + 2], 2); - dump_block_q8_0(&x[i * 8 + 3], 3); - dump_block_q8_0(&x[i * 8 + 4], 4); - dump_block_q8_0(&x[i * 8 + 5], 5); - dump_block_q8_0(&x[i * 8 + 6], 6); - dump_block_q8_0(&x[i * 8 + 7], 7); - } - } -} - -static void init_row_q8x4x2(block_q8_0 * x, int64_t k) { - static const int qk = QK_Q8_0x4x2; - const int nb = (k + qk - 1) / qk; // number of blocks (padded) - - // Init the quants such that they unpack into zeros - uint8_t qs[QK_Q8_0x4x2]; // unpacked quants - memset(qs, 0, sizeof(qs)); - - for (int i = 0; i < nb; i++) { - pack_q8_0_quants(&x[i * 8 + 0], qs, 0); - pack_q8_0_quants(&x[i * 8 + 1], qs, 1); - pack_q8_0_quants(&x[i * 8 + 2], qs, 2); - pack_q8_0_quants(&x[i * 8 + 3], qs, 3); - pack_q8_0_quants(&x[i * 8 + 4], qs, 4); - pack_q8_0_quants(&x[i * 8 + 5], qs, 5); - pack_q8_0_quants(&x[i * 8 + 6], qs, 6); - pack_q8_0_quants(&x[i * 8 + 7], qs, 7); - } - - // Init the scales - // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2) - // the last block is truncated and overridden by the scales. - for (int i = 0; i < nb; i++) { - // Unpack the scales - x[i * 8 + 0].d = 0; - x[i * 8 + 1].d = 0; - x[i * 8 + 2].d = 0; - x[i * 8 + 3].d = 0; - x[i * 8 + 4].d = 0; - x[i * 8 + 5].d = 0; - x[i * 8 + 6].d = 0; - x[i * 8 + 7].d = 0; - } -} - -// repack q8_0 data into q8x4x2 tensor -static void repack_q8_0_q8x4x2(ggml_tensor * t, const void * data, size_t size) { - int64_t nrows = ggml_nrows(t); - - size_t row_size = ggml_row_size(t->type, t->ne[0]); - size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad - size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size quants + scales) - - // Ensure we don't try to read more data than is available in the source buffer 'data' - // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; - - // Calculate how many full rows and how many remaining bytes we need to process. - const int64_t n_full_rows = n_bytes_to_copy / row_size; - const size_t n_rem_bytes = n_bytes_to_copy % row_size; - - void * buf_pd = ggml_aligned_malloc(row_size_pd); - GGML_ASSERT(buf_pd != NULL); - - void * buf_rp = ggml_aligned_malloc(row_size_rp); - GGML_ASSERT(buf_rp != NULL); - - HEX_VERBOSE("ggml-hex: repack-q8_0-q8x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, - t->ne[0], nrows, row_size); - - init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros - - // 1. Process all the full rows - for (int64_t i = 0; i < n_full_rows; i++) { - const uint8_t * src = (const uint8_t *) data + (i * row_size); - uint8_t * dst = (uint8_t *) t->data + (i * row_size); - - memcpy(buf_pd, src, row_size); - repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]); - memcpy(dst, buf_rp, row_size); - } - - // 2. Process the final, potentially partial, row - if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; - const uint8_t * src = (const uint8_t *) data + (i * row_size); - uint8_t * dst = (uint8_t *) t->data + (i * row_size); - - // re-init the row because we are potentially copying a partial row - init_row_q8x4x2((block_q8_0 *) buf_pd, t->ne[0]); - - // Copy only the remaining bytes from the source. - memcpy(buf_pd, src, n_rem_bytes); - - // Repack the entire buffer - repack_row_q8x4x2((uint8_t *) buf_rp, (const block_q8_0 *) buf_pd, t->ne[0]); - - // Write only the corresponding remaining bytes to the destination tensor. - memcpy(dst, buf_rp, n_rem_bytes); - } - - ggml_aligned_free(buf_pd, row_size_pd); - ggml_aligned_free(buf_rp, row_size_rp); -} - -// repack q8x4x2 tensor into q8_0 data -static void repack_q8x4x2_q8_0(void * data, const ggml_tensor * t, size_t size) { - int64_t nrows = ggml_nrows(t); - - size_t row_size = ggml_row_size(t->type, t->ne[0]); - size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_Q8_0x4x2)); // extra elements for the pad - size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size quants + scales) - - // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; - - // Calculate how many full rows and how many remaining bytes we need to process. - const int64_t n_full_rows = n_bytes_to_copy / row_size; - const size_t n_rem_bytes = n_bytes_to_copy % row_size; - - void * buf_pd = ggml_aligned_malloc(row_size_pd); - GGML_ASSERT(buf_pd != NULL); - - void * buf_rp = ggml_aligned_malloc(row_size_rp); - GGML_ASSERT(buf_rp != NULL); - - HEX_VERBOSE("ggml-hex: repack-q8x4x2-q8_0 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, size, - t->ne[0], nrows, row_size); - - memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros - - // 1. Process all the full rows - for (int64_t i = 0; i < n_full_rows; i++) { - const uint8_t * src = (const uint8_t *) t->data + (i * row_size); - uint8_t * dst = (uint8_t *) data + (i * row_size); - - memcpy(buf_pd, src, row_size); - unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]); - memcpy(dst, buf_rp, row_size); - } - - // 2. Process the final, potentially partial, row - if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; - const uint8_t * src = (const uint8_t *) t->data + (i * row_size); - uint8_t * dst = (uint8_t *) data + (i * row_size); - - // We still need to read and unpack the entire source row because quantization is block-based. - memcpy(buf_pd, src, row_size); - unpack_row_q8x4x2((block_q8_0 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]); - - // But we only copy the remaining number of bytes to the destination. - memcpy(dst, buf_rp, n_rem_bytes); - } - - ggml_aligned_free(buf_pd, row_size_pd); - ggml_aligned_free(buf_rp, row_size_rp); -} - -// ======== MXFP4x4x2 ==================== -struct x2_mxfp4 { - int v[2]; -}; - -static x2_mxfp4 unpack_mxfp4(uint8_t v) { - x2_mxfp4 x; - x.v[0] = kvalues_mxfp4[(v & 0x0f)]; - x.v[1] = kvalues_mxfp4[(v >> 4)]; - return x; -} - -static void dump_block_mxfp4(const block_mxfp4 * b, int i) { - HEX_VERBOSE("ggml-hex: repack mxfp4 %d: %d %d %d %d ... %d %d %d %d : %.6f\n", i, unpack_mxfp4(b->qs[0]).v[0], - unpack_mxfp4(b->qs[1]).v[0], unpack_mxfp4(b->qs[2]).v[0], unpack_mxfp4(b->qs[3]).v[0], - unpack_mxfp4(b->qs[12]).v[1], unpack_mxfp4(b->qs[13]).v[1], unpack_mxfp4(b->qs[14]).v[1], - unpack_mxfp4(b->qs[15]).v[1], GGML_E8M0_TO_FP32_HALF(b->e)); -} - -static void dump_packed_block_mxfp4x4x2(const uint8_t * v, unsigned int i, size_t k) { - static const int qk = QK_MXFP4x4x2; - const int eblk_size = 8 * 1; // 8x E8M0 - const int qblk_size = qk / 2; // int4 - const int qrow_size = k / 2; // int4 (not padded) - - const uint8_t * v_q = v + 0; // quants first - const uint8_t * v_e = v + qrow_size; // then scales - - const uint8_t * q = v_q + i * qblk_size; - const uint8_t * e = (const uint8_t *) (v_e + i * eblk_size); - - HEX_VERBOSE("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", i, - unpack_mxfp4(q[0]).v[0], unpack_mxfp4(q[1]).v[0], unpack_mxfp4(q[2]).v[0], unpack_mxfp4(q[3]).v[0], - unpack_mxfp4(q[60]).v[0], unpack_mxfp4(q[61]).v[0], unpack_mxfp4(q[62]).v[0], unpack_mxfp4(q[63]).v[0], - unpack_mxfp4(q[124]).v[0], unpack_mxfp4(q[125]).v[0], unpack_mxfp4(q[126]).v[0], - unpack_mxfp4(q[127]).v[0], GGML_E8M0_TO_FP32_HALF(e[0]), GGML_E8M0_TO_FP32_HALF(e[1]), - GGML_E8M0_TO_FP32_HALF(e[2]), GGML_E8M0_TO_FP32_HALF(e[3])); - - HEX_VERBOSE("ggml-hex: repack mxfp4x4x2-%d: %d %d %d %d ... %d %d %d %d ... %d %d %d %d : %.6f %.6f %.6f %.6f\n", - i + 1, unpack_mxfp4(q[0]).v[1], unpack_mxfp4(q[1]).v[1], unpack_mxfp4(q[2]).v[1], - unpack_mxfp4(q[3]).v[1], unpack_mxfp4(q[60]).v[1], unpack_mxfp4(q[61]).v[1], unpack_mxfp4(q[62]).v[1], - unpack_mxfp4(q[63]).v[1], unpack_mxfp4(q[124]).v[1], unpack_mxfp4(q[125]).v[1], - unpack_mxfp4(q[126]).v[1], unpack_mxfp4(q[127]).v[1], GGML_E8M0_TO_FP32_HALF(e[4]), - GGML_E8M0_TO_FP32_HALF(e[5]), GGML_E8M0_TO_FP32_HALF(e[6]), GGML_E8M0_TO_FP32_HALF(e[7])); -} - static void unpack_mxfp4_quants(uint8_t * qs, const block_mxfp4 * x, unsigned int bi) { static const int qk = QK_MXFP4; for (unsigned int i = 0; i < qk / 2; ++i) { - const uint8_t x0 = (x->qs[i] & 0x0F); - const uint8_t x1 = (x->qs[i] >> 4); + const int x0 = (x->qs[i] & 0x0F); + const int x1 = (x->qs[i] >> 4); qs[bi * qk + i + 0] = x0; qs[bi * qk + i + qk / 2] = x1; } } static void pack_mxfp4_quants(block_mxfp4 * x, const uint8_t * qs, unsigned int bi) { - static const int qk = QK4_0; + static const int qk = QK_MXFP4; for (unsigned int i = 0; i < qk / 2; ++i) { const uint8_t x0 = qs[bi * qk + i + 0]; @@ -1288,299 +435,419 @@ static void pack_mxfp4_quants(block_mxfp4 * x, const uint8_t * qs, unsigned int } } -static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k) { - static const int qk = QK_MXFP4x4x2; - const int nb = (k + qk - 1) / qk; // number of blocks (padded) - const int nloe = k % qk; // leftovers - - const int eblk_size = 8 * 1; // 8x E8M0 - const int qblk_size = qk / 2; // int4 - const int qrow_size = k / 2; // int4 (not padded to blocks) - - uint8_t * y_q = y + 0; // quants first - uint8_t * y_e = y + qrow_size; // then scales - - if (opt_verbose > 2) { - for (int i = 0; i < nb; i++) { - dump_block_mxfp4(&x[i * 8 + 0], 0); - dump_block_mxfp4(&x[i * 8 + 1], 1); - dump_block_mxfp4(&x[i * 8 + 2], 2); - dump_block_mxfp4(&x[i * 8 + 3], 3); - dump_block_mxfp4(&x[i * 8 + 4], 4); - dump_block_mxfp4(&x[i * 8 + 5], 5); - dump_block_mxfp4(&x[i * 8 + 6], 6); - dump_block_mxfp4(&x[i * 8 + 7], 7); - } - } - - // Repack the quants - for (int i = 0; i < nb; i++) { - uint8_t qs[QK_MXFP4x4x2]; // unpacked quants - - unpack_mxfp4_quants(qs, &x[i * 8 + 0], 0); - unpack_mxfp4_quants(qs, &x[i * 8 + 1], 1); - unpack_mxfp4_quants(qs, &x[i * 8 + 2], 2); - unpack_mxfp4_quants(qs, &x[i * 8 + 3], 3); - unpack_mxfp4_quants(qs, &x[i * 8 + 4], 4); - unpack_mxfp4_quants(qs, &x[i * 8 + 5], 5); - unpack_mxfp4_quants(qs, &x[i * 8 + 6], 6); - unpack_mxfp4_quants(qs, &x[i * 8 + 7], 7); - - bool partial = (nloe && i == nb-1); - - uint8_t * q = y_q + (i * qblk_size); - for (int j = 0; j < qk / 2; j++) { - q[j] = partial ? (qs[j*2+1] << 4) | qs[j*2+0] : (qs[j+128] << 4) | qs[j+000]; - } - } - - // Repack the scales - // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2) - // the last block is truncated and overridden by the scales. - for (int i = 0; i < nb; i++) { - // Repack the scales - uint8_t * e = (uint8_t *) (y_e + i * eblk_size); - e[0] = x[i * 8 + 0].e; - e[1] = x[i * 8 + 1].e; - e[2] = x[i * 8 + 2].e; - e[3] = x[i * 8 + 3].e; - e[4] = x[i * 8 + 4].e; - e[5] = x[i * 8 + 5].e; - e[6] = x[i * 8 + 6].e; - e[7] = x[i * 8 + 7].e; - } - - if (opt_verbose > 2) { - for (int i = 0; i < nb; i++) { - dump_packed_block_mxfp4x4x2(y, i, k); - } - } -} - -static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k) { - static const int qk = QK_MXFP4x4x2; - const int nb = (k + qk - 1) / qk; // number of blocks (padded) - const int nloe = k % qk; // leftovers - - const int eblk_size = 8 * 1; // 8x E8M0 - const int qblk_size = qk / 2; // int4 - const int qrow_size = k / 2; // int4 (not padded to blocks) - - const uint8_t * y_q = y + 0; // quants first - const uint8_t * y_e = y + qrow_size; // then scales - - if (opt_verbose > 2) { - for (int i = 0; i < nb; i++) { - dump_packed_block_mxfp4x4x2(y, i, k); - } - } - - // Unpack the quants - for (int i = 0; i < nb; i++) { - uint8_t qs[QK_MXFP4x4x2]; // unpacked quants - - bool partial = (nloe && i == nb-1); - - const uint8_t * q = y_q + (i * qblk_size); - for (int j = 0; j < qk / 2; j++) { - if (partial) { - qs[j*2+0] = q[j] & 0xf; - qs[j*2+1] = q[j] >> 4; - } else { - qs[j+000] = q[j] & 0xf; - qs[j+128] = q[j] >> 4; +// repack q4_0 data into q4_0_tiled tensor +static void repack_q4_0_tiled(ggml_tensor * t, const void * data, size_t size) { + const block_q4_0 * src_matrix = (const block_q4_0 *) data; + int64_t ne0 = t->ne[0]; + int64_t ne1 = t->ne[1]; + int64_t ne2 = t->ne[2]; + int64_t ne3 = t->ne[3]; + int64_t ne0_padded = hex_round_up(ne0, 32); + int64_t ne1_padded = hex_round_up(ne1, 32); + + int n_col_tiles = ne1_padded / 32; + int n_k_tiles = ne0_padded / 32; + const size_t tile_size = HTP_MM_WEIGHT_TILE_SIZE_Q4_0; + const size_t matrix_size = n_col_tiles * n_k_tiles * tile_size; + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = 0; i2 < ne2; i2++) { + const block_q4_0 * src_expert = src_matrix + (i3 * ne2 + i2) * (ne1 * (ne0 / 32)); + uint8_t * matrix_dst = (uint8_t *) t->data + (i3 * ne2 + i2) * matrix_size; + + for (int ct = 0; ct < n_col_tiles; ct++) { + for (int kt = 0; kt < n_k_tiles; kt++) { + uint8_t * tile_dst = matrix_dst + (ct * n_k_tiles + kt) * tile_size; + + uint8_t tile_quants[32][32]; + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + if (r < ne1 && kt < ne0 / 32) { + unpack_q4_0_quants(tile_quants[row], &src_expert[r * (ne0 / 32) + kt], 0); + } else { + memset(tile_quants[row], 8, 32); + } + } + + for (int cp = 0; cp < 16; cp++) { + for (int row = 0; row < 32; row++) { + tile_dst[cp * 32 + row] = (tile_quants[row][2 * cp + 1] << 4) | tile_quants[row][2 * cp]; + } + } + + ggml_half * scale_dst = (ggml_half *)(tile_dst + 512); + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + scale_dst[row] = (r < ne1 && kt < ne0 / 32) ? src_expert[r * (ne0 / 32) + kt].d : 0; + } + } } } - - pack_mxfp4_quants(&x[i * 8 + 0], qs, 0); - pack_mxfp4_quants(&x[i * 8 + 1], qs, 1); - pack_mxfp4_quants(&x[i * 8 + 2], qs, 2); - pack_mxfp4_quants(&x[i * 8 + 3], qs, 3); - pack_mxfp4_quants(&x[i * 8 + 4], qs, 4); - pack_mxfp4_quants(&x[i * 8 + 5], qs, 5); - pack_mxfp4_quants(&x[i * 8 + 6], qs, 6); - pack_mxfp4_quants(&x[i * 8 + 7], qs, 7); - } - - // Repack the scales - // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4_0x4x2) - // the last block is truncated and overridden by the scales. - for (int i = 0; i < nb; i++) { - // Unpack the scales - const uint8_t * e = (const uint8_t *) (y_e + i * eblk_size); - x[i * 8 + 0].e = e[0]; - x[i * 8 + 1].e = e[1]; - x[i * 8 + 2].e = e[2]; - x[i * 8 + 3].e = e[3]; - x[i * 8 + 4].e = e[4]; - x[i * 8 + 5].e = e[5]; - x[i * 8 + 6].e = e[6]; - x[i * 8 + 7].e = e[7]; - } - - if (opt_verbose > 2) { - for (int i = 0; i < nb; i++) { - dump_block_mxfp4(&x[i * 8 + 0], 0); - dump_block_mxfp4(&x[i * 8 + 1], 1); - dump_block_mxfp4(&x[i * 8 + 2], 2); - dump_block_mxfp4(&x[i * 8 + 3], 3); - dump_block_mxfp4(&x[i * 8 + 4], 4); - dump_block_mxfp4(&x[i * 8 + 5], 5); - dump_block_mxfp4(&x[i * 8 + 6], 6); - dump_block_mxfp4(&x[i * 8 + 7], 7); - } - } -} - -static void init_row_mxfp4x4x2(block_mxfp4 * x, int64_t k) { - static const int qk = QK_MXFP4x4x2; - const int nb = (k + qk - 1) / qk; // number of blocks (padded) - - // Init the quants such that they unpack into zeros - uint8_t qs[QK_MXFP4x4x2]; // unpacked quants - memset(qs, 0, sizeof(qs)); - - for (int i = 0; i < nb; i++) { - pack_mxfp4_quants(&x[i * 8 + 0], qs, 0); - pack_mxfp4_quants(&x[i * 8 + 1], qs, 1); - pack_mxfp4_quants(&x[i * 8 + 2], qs, 2); - pack_mxfp4_quants(&x[i * 8 + 3], qs, 3); - pack_mxfp4_quants(&x[i * 8 + 4], qs, 4); - pack_mxfp4_quants(&x[i * 8 + 5], qs, 5); - pack_mxfp4_quants(&x[i * 8 + 6], qs, 6); - pack_mxfp4_quants(&x[i * 8 + 7], qs, 7); } +} - // Init the scales - // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2) - // the last block is truncated and overridden by the scales. - for (int i = 0; i < nb; i++) { - // Unpack the scales - x[i * 8 + 0].e = 0; - x[i * 8 + 1].e = 0; - x[i * 8 + 2].e = 0; - x[i * 8 + 3].e = 0; - x[i * 8 + 4].e = 0; - x[i * 8 + 5].e = 0; - x[i * 8 + 6].e = 0; - x[i * 8 + 7].e = 0; - } -} - -// repack mxfp4 data into mxfp4x4x2 tensor -static void repack_mxfp4_mxfp4x4x2(ggml_tensor * t, const void * data, size_t size) { - int64_t nrows = ggml_nrows(t); - - size_t row_size = ggml_row_size(t->type, t->ne[0]); - size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad - size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales) - - // Ensure we don't try to read more data than is available in the source buffer 'data' - // or write more than the tensor can hold. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; - - // Calculate how many full rows and how many remaining bytes we need to process. - const int64_t n_full_rows = n_bytes_to_copy / row_size; - const size_t n_rem_bytes = n_bytes_to_copy % row_size; - - void * buf_pd = ggml_aligned_malloc(row_size_pd); - GGML_ASSERT(buf_pd != NULL); - - void * buf_rp = ggml_aligned_malloc(row_size_rp); - GGML_ASSERT(buf_rp != NULL); - - HEX_VERBOSE("ggml-hex: repack-mxfp4-mxfp4x4x2 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, - size, t->ne[0], nrows, row_size); - - init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); // init padded buffer to make sure the tail is all zeros - - // 1. Process all the full rows - for (int64_t i = 0; i < n_full_rows; i++) { - const uint8_t * src = (const uint8_t *) data + (i * row_size); - uint8_t * dst = (uint8_t *) t->data + (i * row_size); - - memcpy(buf_pd, src, row_size); - repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]); - memcpy(dst, buf_rp, row_size); - } - - // 2. Process the final, potentially partial, row - if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; - const uint8_t * src = (const uint8_t *) data + (i * row_size); - uint8_t * dst = (uint8_t *) t->data + (i * row_size); - - // re-init the row because we are potentially copying a partial row - init_row_mxfp4x4x2((block_mxfp4 *) buf_pd, t->ne[0]); - - // Copy only the remaining bytes from the source. - memcpy(buf_pd, src, n_rem_bytes); - - // Repack the entire buffer (partial data + zero padding). - repack_row_mxfp4x4x2((uint8_t *) buf_rp, (const block_mxfp4 *) buf_pd, t->ne[0]); - - // Write only the corresponding remaining bytes to the destination tensor. - memcpy(dst, buf_rp, n_rem_bytes); - } - - ggml_aligned_free(buf_pd, row_size_pd); - ggml_aligned_free(buf_rp, row_size_rp); -} - -// repack mxfp4x4x2 tensor into mxfp4 data -static void repack_mxfp4x4x2_mxfp4(void * data, const ggml_tensor * t, size_t size) { - int64_t nrows = ggml_nrows(t); - - size_t row_size = ggml_row_size(t->type, t->ne[0]); - size_t row_size_pd = ggml_row_size(t->type, hex_round_up(t->ne[0], QK_MXFP4x4x2)); // extra elements for the pad - size_t row_size_rp = row_size_pd; // scratch must hold one full padded tile (qblk_size/2 quants + scales) - - // Ensure we don't try to copy more data than the tensor actually contains. - const size_t total_tensor_size = (size_t)nrows * row_size; - const size_t n_bytes_to_copy = size < total_tensor_size ? size : total_tensor_size; - - // Calculate how many full rows and how many remaining bytes we need to process. - const int64_t n_full_rows = n_bytes_to_copy / row_size; - const size_t n_rem_bytes = n_bytes_to_copy % row_size; - - void * buf_pd = ggml_aligned_malloc(row_size_pd); - GGML_ASSERT(buf_pd != NULL); - - void * buf_rp = ggml_aligned_malloc(row_size_rp); - GGML_ASSERT(buf_rp != NULL); - - HEX_VERBOSE("ggml-hex: repack-mxfp4x4x2-mxfp4 %s : data %p size %zu dims %ldx%ld row-size %zu\n", t->name, data, - size, t->ne[0], nrows, row_size); - - memset(buf_pd, 0, row_size_pd); // clear-out padded buffer to make sure the tail is all zeros +// repack q4_0_tiled tensor into q4_0 data +static void repack_tiled_q4_0(void * data, const ggml_tensor * t, size_t size) { + block_q4_0 * dst_matrix = (block_q4_0 *) data; + int64_t ne0 = t->ne[0]; + int64_t ne1 = t->ne[1]; + int64_t ne2 = t->ne[2]; + int64_t ne3 = t->ne[3]; + int64_t ne0_padded = hex_round_up(ne0, 32); + int64_t ne1_padded = hex_round_up(ne1, 32); + + int n_col_tiles = ne1_padded / 32; + int n_k_tiles = ne0_padded / 32; + const size_t tile_size = HTP_MM_WEIGHT_TILE_SIZE_Q4_0; + const size_t matrix_size = n_col_tiles * n_k_tiles * tile_size; + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = 0; i2 < ne2; i2++) { + block_q4_0 * dst_expert = dst_matrix + (i3 * ne2 + i2) * (ne1 * (ne0 / 32)); + const uint8_t * matrix_src = (const uint8_t *) t->data + (i3 * ne2 + i2) * matrix_size; + + for (int ct = 0; ct < n_col_tiles; ct++) { + for (int kt = 0; kt < n_k_tiles; kt++) { + const uint8_t * tile_src = matrix_src + (ct * n_k_tiles + kt) * tile_size; + + uint8_t tile_quants[32][32]; + for (int cp = 0; cp < 16; cp++) { + for (int row = 0; row < 32; row++) { + uint8_t val = tile_src[cp * 32 + row]; + tile_quants[row][2 * cp + 0] = val & 0x0F; + tile_quants[row][2 * cp + 1] = val >> 4; + } + } + + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + if (r < ne1 && kt < ne0 / 32) { + pack_q4_0_quants(&dst_expert[r * (ne0 / 32) + kt], tile_quants[row], 0); + } + } + + const ggml_half * scale_src = (const ggml_half *)(tile_src + 512); + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + if (r < ne1 && kt < ne0 / 32) { + dst_expert[r * (ne0 / 32) + kt].d = scale_src[row]; + } + } + } + } + } + } +} - // 1. Process all the full rows - for (int64_t i = 0; i < n_full_rows; i++) { - const uint8_t * src = (const uint8_t *) t->data + (i * row_size); - uint8_t * dst = (uint8_t *) data + (i * row_size); +// repack q4_1 data into q4_1_tiled tensor +static void repack_q4_1_tiled(ggml_tensor * t, const void * data, size_t size) { + const block_q4_1 * src_matrix = (const block_q4_1 *) data; + int64_t ne0 = t->ne[0]; + int64_t ne1 = t->ne[1]; + int64_t ne2 = t->ne[2]; + int64_t ne3 = t->ne[3]; + int64_t ne0_padded = hex_round_up(ne0, 32); + int64_t ne1_padded = hex_round_up(ne1, 32); + + int n_col_tiles = ne1_padded / 32; + int n_k_tiles = ne0_padded / 32; + const size_t tile_size = HTP_MM_WEIGHT_TILE_SIZE_Q4_1; + const size_t matrix_size = n_col_tiles * n_k_tiles * tile_size; + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = 0; i2 < ne2; i2++) { + const block_q4_1 * src_expert = src_matrix + (i3 * ne2 + i2) * (ne1 * (ne0 / 32)); + uint8_t * matrix_dst = (uint8_t *) t->data + (i3 * ne2 + i2) * matrix_size; + + for (int ct = 0; ct < n_col_tiles; ct++) { + for (int kt = 0; kt < n_k_tiles; kt++) { + uint8_t * tile_dst = matrix_dst + (ct * n_k_tiles + kt) * tile_size; + + uint8_t tile_quants[32][32]; + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + if (r < ne1 && kt < ne0 / 32) { + unpack_q4_1_quants(tile_quants[row], &src_expert[r * (ne0 / 32) + kt], 0); + } else { + memset(tile_quants[row], 0, 32); + } + } + + for (int cp = 0; cp < 16; cp++) { + for (int row = 0; row < 32; row++) { + tile_dst[cp * 32 + row] = (tile_quants[row][2 * cp + 1] << 4) | tile_quants[row][2 * cp]; + } + } + + ggml_half * scale_dst = (ggml_half *)(tile_dst + 512); + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + if (r < ne1 && kt < ne0 / 32) { + scale_dst[2 * row + 0] = src_expert[r * (ne0 / 32) + kt].d; + scale_dst[2 * row + 1] = src_expert[r * (ne0 / 32) + kt].m; + } else { + scale_dst[2 * row + 0] = 0; + scale_dst[2 * row + 1] = 0; + } + } + } + } + } + } +} - memcpy(buf_pd, src, row_size); - unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]); - memcpy(dst, buf_rp, row_size); +// repack q4_1_tiled tensor into q4_1 data +static void repack_tiled_q4_1(void * data, const ggml_tensor * t, size_t size) { + block_q4_1 * dst_matrix = (block_q4_1 *) data; + int64_t ne0 = t->ne[0]; + int64_t ne1 = t->ne[1]; + int64_t ne2 = t->ne[2]; + int64_t ne3 = t->ne[3]; + int64_t ne0_padded = hex_round_up(ne0, 32); + int64_t ne1_padded = hex_round_up(ne1, 32); + + int n_col_tiles = ne1_padded / 32; + int n_k_tiles = ne0_padded / 32; + const size_t tile_size = HTP_MM_WEIGHT_TILE_SIZE_Q4_1; + const size_t matrix_size = n_col_tiles * n_k_tiles * tile_size; + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = 0; i2 < ne2; i2++) { + block_q4_1 * dst_expert = dst_matrix + (i3 * ne2 + i2) * (ne1 * (ne0 / 32)); + const uint8_t * matrix_src = (const uint8_t *) t->data + (i3 * ne2 + i2) * matrix_size; + + for (int ct = 0; ct < n_col_tiles; ct++) { + for (int kt = 0; kt < n_k_tiles; kt++) { + const uint8_t * tile_src = matrix_src + (ct * n_k_tiles + kt) * tile_size; + + uint8_t tile_quants[32][32]; + for (int cp = 0; cp < 16; cp++) { + for (int row = 0; row < 32; row++) { + uint8_t val = tile_src[cp * 32 + row]; + tile_quants[row][2 * cp + 0] = val & 0x0F; + tile_quants[row][2 * cp + 1] = val >> 4; + } + } + + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + if (r < ne1 && kt < ne0 / 32) { + pack_q4_1_quants(&dst_expert[r * (ne0 / 32) + kt], tile_quants[row], 0); + } + } + + const ggml_half * scale_src = (const ggml_half *)(tile_src + 512); + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + if (r < ne1 && kt < ne0 / 32) { + dst_expert[r * (ne0 / 32) + kt].d = scale_src[2 * row]; + dst_expert[r * (ne0 / 32) + kt].m = scale_src[2 * row + 1]; + } + } + } + } + } } +} - // 2. Process the final, potentially partial, row - if (n_rem_bytes > 0) { - const int64_t i = n_full_rows; - const uint8_t * src = (const uint8_t *) t->data + (i * row_size); - uint8_t * dst = (uint8_t *) data + (i * row_size); +// repack q8_0 data into q8_0_tiled tensor +static void repack_q8_0_tiled(ggml_tensor * t, const void * data, size_t size) { + const block_q8_0 * src_matrix = (const block_q8_0 *) data; + int64_t ne0 = t->ne[0]; + int64_t ne1 = t->ne[1]; + int64_t ne2 = t->ne[2]; + int64_t ne3 = t->ne[3]; + int64_t ne0_padded = hex_round_up(ne0, 32); + int64_t ne1_padded = hex_round_up(ne1, 32); + + int n_col_tiles = ne1_padded / 32; + int n_k_tiles = ne0_padded / 32; + const size_t tile_size = HTP_MM_WEIGHT_TILE_SIZE_Q8_0; + const size_t matrix_size = n_col_tiles * n_k_tiles * tile_size; + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = 0; i2 < ne2; i2++) { + const block_q8_0 * src_expert = src_matrix + (i3 * ne2 + i2) * (ne1 * (ne0 / 32)); + uint8_t * matrix_dst = (uint8_t *) t->data + (i3 * ne2 + i2) * matrix_size; + + for (int ct = 0; ct < n_col_tiles; ct++) { + for (int kt = 0; kt < n_k_tiles; kt++) { + uint8_t * tile_dst = matrix_dst + (ct * n_k_tiles + kt) * tile_size; + + for (int cp = 0; cp < 16; cp++) { + int col0 = cp * 2; + int col1 = col0 + 1; + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + const block_q8_0 * b = (r < ne1 && kt < ne0 / 32) ? &src_expert[r * (ne0 / 32) + kt] : NULL; + tile_dst[cp * 64 + 2 * row + 0] = b ? b->qs[col0] : 0; + tile_dst[cp * 64 + 2 * row + 1] = b ? b->qs[col1] : 0; + } + } + + ggml_half * scale_dst = (ggml_half *)(tile_dst + 1024); + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + scale_dst[row] = (r < ne1 && kt < ne0 / 32) ? src_expert[r * (ne0 / 32) + kt].d : 0; + } + } + } + } + } +} - // We still need to read and unpack the entire source row because the format is block-based. - memcpy(buf_pd, src, row_size); - unpack_row_mxfp4x4x2((block_mxfp4 *) buf_rp, (const uint8_t *) buf_pd, t->ne[0]); +// repack q8_0_tiled tensor into q8_0 data +static void repack_tiled_q8_0(void * data, const ggml_tensor * t, size_t size) { + block_q8_0 * dst_matrix = (block_q8_0 *) data; + int64_t ne0 = t->ne[0]; + int64_t ne1 = t->ne[1]; + int64_t ne2 = t->ne[2]; + int64_t ne3 = t->ne[3]; + int64_t ne0_padded = hex_round_up(ne0, 32); + int64_t ne1_padded = hex_round_up(ne1, 32); + + int n_col_tiles = ne1_padded / 32; + int n_k_tiles = ne0_padded / 32; + const size_t tile_size = HTP_MM_WEIGHT_TILE_SIZE_Q8_0; + const size_t matrix_size = n_col_tiles * n_k_tiles * tile_size; + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = 0; i2 < ne2; i2++) { + block_q8_0 * dst_expert = dst_matrix + (i3 * ne2 + i2) * (ne1 * (ne0 / 32)); + const uint8_t * matrix_src = (const uint8_t *) t->data + (i3 * ne2 + i2) * matrix_size; + + for (int ct = 0; ct < n_col_tiles; ct++) { + for (int kt = 0; kt < n_k_tiles; kt++) { + const uint8_t * tile_src = matrix_src + (ct * n_k_tiles + kt) * tile_size; + + for (int cp = 0; cp < 16; cp++) { + int col0 = cp * 2; + int col1 = col0 + 1; + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + if (r < ne1 && kt < ne0 / 32) { + block_q8_0 & b = dst_expert[r * (ne0 / 32) + kt]; + b.qs[col0] = tile_src[cp * 64 + 2 * row + 0]; + b.qs[col1] = tile_src[cp * 64 + 2 * row + 1]; + } + } + } + + const ggml_half * scale_src = (const ggml_half *)(tile_src + 1024); + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + if (r < ne1 && kt < ne0 / 32) { + dst_expert[r * (ne0 / 32) + kt].d = scale_src[row]; + } + } + } + } + } + } +} - // But we only copy the remaining number of bytes to the destination to respect the size limit. - memcpy(dst, buf_rp, n_rem_bytes); +// repack mxfp4 data into mxfp4_tiled tensor +static void repack_mxfp4_tiled(ggml_tensor * t, const void * data, size_t size) { + const block_mxfp4 * src_matrix = (const block_mxfp4 *) data; + int64_t ne0 = t->ne[0]; + int64_t ne1 = t->ne[1]; + int64_t ne2 = t->ne[2]; + int64_t ne3 = t->ne[3]; + int64_t ne0_padded = hex_round_up(ne0, 32); + int64_t ne1_padded = hex_round_up(ne1, 32); + + int n_col_tiles = ne1_padded / 32; + int n_k_tiles = ne0_padded / 32; + const size_t tile_size = HTP_MM_WEIGHT_TILE_SIZE_MXFP4; + const size_t matrix_size = n_col_tiles * n_k_tiles * tile_size; + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = 0; i2 < ne2; i2++) { + const block_mxfp4 * src_expert = src_matrix + (i3 * ne2 + i2) * (ne1 * (ne0 / 32)); + uint8_t * matrix_dst = (uint8_t *) t->data + (i3 * ne2 + i2) * matrix_size; + + for (int ct = 0; ct < n_col_tiles; ct++) { + for (int kt = 0; kt < n_k_tiles; kt++) { + uint8_t * tile_dst = matrix_dst + (ct * n_k_tiles + kt) * tile_size; + + uint8_t tile_quants[32][32]; + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + if (r < ne1 && kt < ne0 / 32) { + unpack_mxfp4_quants(tile_quants[row], &src_expert[r * (ne0 / 32) + kt], 0); + } else { + memset(tile_quants[row], 0, 32); + } + } + + for (int cp = 0; cp < 16; cp++) { + for (int row = 0; row < 32; row++) { + tile_dst[cp * 32 + row] = (tile_quants[row][2 * cp + 1] << 4) | tile_quants[row][2 * cp]; + } + } + + uint8_t * scale_dst = tile_dst + 512; + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + scale_dst[row] = (r < ne1 && kt < ne0 / 32) ? src_expert[r * (ne0 / 32) + kt].e : 0; + } + } + } + } } +} - ggml_aligned_free(buf_pd, row_size_pd); - ggml_aligned_free(buf_rp, row_size_rp); +// repack mxfp4_tiled tensor into mxfp4 data +static void repack_tiled_mxfp4(void * data, const ggml_tensor * t, size_t size) { + block_mxfp4 * dst_matrix = (block_mxfp4 *) data; + int64_t ne0 = t->ne[0]; + int64_t ne1 = t->ne[1]; + int64_t ne2 = t->ne[2]; + int64_t ne3 = t->ne[3]; + int64_t ne0_padded = hex_round_up(ne0, 32); + int64_t ne1_padded = hex_round_up(ne1, 32); + + int n_col_tiles = ne1_padded / 32; + int n_k_tiles = ne0_padded / 32; + const size_t tile_size = HTP_MM_WEIGHT_TILE_SIZE_MXFP4; + const size_t matrix_size = n_col_tiles * n_k_tiles * tile_size; + + for (int i3 = 0; i3 < ne3; i3++) { + for (int i2 = 0; i2 < ne2; i2++) { + block_mxfp4 * dst_expert = dst_matrix + (i3 * ne2 + i2) * (ne1 * (ne0 / 32)); + const uint8_t * matrix_src = (const uint8_t *) t->data + (i3 * ne2 + i2) * matrix_size; + + for (int ct = 0; ct < n_col_tiles; ct++) { + for (int kt = 0; kt < n_k_tiles; kt++) { + const uint8_t * tile_src = matrix_src + (ct * n_k_tiles + kt) * tile_size; + + uint8_t tile_quants[32][32]; + for (int cp = 0; cp < 16; cp++) { + for (int row = 0; row < 32; row++) { + uint8_t val = tile_src[cp * 32 + row]; + tile_quants[row][2 * cp + 0] = val & 0x0F; + tile_quants[row][2 * cp + 1] = val >> 4; + } + } + + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + if (r < ne1 && kt < ne0 / 32) { + pack_mxfp4_quants(&dst_expert[r * (ne0 / 32) + kt], tile_quants[row], 0); + } + } + + const uint8_t * scale_src = tile_src + 512; + for (int row = 0; row < 32; row++) { + int64_t r = ct * 32 + row; + if (r < ne1 && kt < ne0 / 32) { + dst_expert[r * (ne0 / 32) + kt].e = scale_src[row]; + } + } + } + } + } + } } static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, @@ -1597,32 +864,32 @@ static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, case GGML_TYPE_Q4_0: GGML_ASSERT(offset == 0); GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); - repack_q4_0_q4x4x2(tensor, data, size); + repack_q4_0_tiled(tensor, data, size); break; case GGML_TYPE_Q4_1: GGML_ASSERT(offset == 0); GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); - repack_q4_1_q4x4x2(tensor, data, size); + repack_q4_1_tiled(tensor, data, size); break; case GGML_TYPE_Q8_0: GGML_ASSERT(offset == 0); GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); - repack_q8_0_q8x4x2(tensor, data, size); + repack_q8_0_tiled(tensor, data, size); break; case GGML_TYPE_IQ4_NL: GGML_ASSERT(offset == 0); GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); // IQ4_NL has identical block layout to Q4_0 (ggml_half d + uint8_t qs[16]) - repack_q4_0_q4x4x2(tensor, data, size); + repack_q4_0_tiled(tensor, data, size); break; case GGML_TYPE_MXFP4: GGML_ASSERT(offset == 0); GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); - repack_mxfp4_mxfp4x4x2(tensor, data, size); + repack_mxfp4_tiled(tensor, data, size); break; default: @@ -1645,31 +912,31 @@ static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, case GGML_TYPE_Q4_0: GGML_ASSERT(offset == 0); GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); - repack_q4x4x2_q4_0(data, tensor, size); + repack_tiled_q4_0(data, tensor, size); break; case GGML_TYPE_Q4_1: GGML_ASSERT(offset == 0); GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); - repack_q4x4x2_q4_1(data, tensor, size); + repack_tiled_q4_1(data, tensor, size); break; case GGML_TYPE_Q8_0: GGML_ASSERT(offset == 0); GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); - repack_q8x4x2_q8_0(data, tensor, size); + repack_tiled_q8_0(data, tensor, size); break; case GGML_TYPE_IQ4_NL: GGML_ASSERT(offset == 0); GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); - repack_q4x4x2_q4_0(data, tensor, size); + repack_tiled_q4_0(data, tensor, size); break; case GGML_TYPE_MXFP4: GGML_ASSERT(offset == 0); GGML_ASSERT(offset + size <= ggml_nbytes(tensor)); - repack_mxfp4x4x2_mxfp4(data, tensor, size); + repack_tiled_mxfp4(data, tensor, size); break; default: @@ -1747,12 +1014,19 @@ static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer } static size_t ggml_backend_hexagon_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * t) { + if (t->type == GGML_TYPE_Q4_0 || t->type == GGML_TYPE_Q4_1 || t->type == GGML_TYPE_Q8_0 || t->type == GGML_TYPE_IQ4_NL || t->type == GGML_TYPE_MXFP4) { + int64_t ne0 = hex_round_up(t->ne[0], 32); + int64_t ne1 = hex_round_up(t->ne[1], 32); + int64_t ne2 = t->ne[2]; + int64_t ne3 = t->ne[3]; + return ggml_row_size(t->type, ne0) * ne1 * ne2 * ne3; + } return ggml_nbytes(t); } static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) { - return opt_mbuf; // typically 1GB per buffer - GGML_UNUSED(buffer_type); + auto * context = static_cast(buffer_type->context); + return context->sess->max_bufsize; } static bool ggml_backend_hexagon_buffer_type_is_host(ggml_backend_buffer_type_t buft) { @@ -1783,6 +1057,17 @@ static ggml_backend_buffer_type_i ggml_backend_hexagon_repack_buffer_type_interf /* .is_host = */ ggml_backend_hexagon_repack_buffer_type_is_host, }; +static bool ggml_backend_buffer_is_hexagon(const struct ggml_backend_buffer * b) { + return b->buft->iface.get_alignment == ggml_backend_hexagon_buffer_type_get_alignment; +} + +static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backend_buffer * b) { + if (!opt_hostbuf) { + return ggml_backend_buffer_is_hexagon(b); + } + return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer; +} + struct ggml_hexagon_opbatch { ggml_hexagon_session* sess; @@ -1863,14 +1148,25 @@ struct ggml_hexagon_opbatch { b_vmem += b.size; - HEX_VERBOSE("ggml-hex: add-buffer #%u : fd %d base %p size %zu : vmem %zu\n", bi, b.fd, (void*) sbuf->base, (size_t) b.size, b_vmem); + HEX_VERBOSE("ggml-hex: %s add-buffer #%u : fd %d base %p size %zu : vmem %zu\n", sess->c_name(), bi, b.fd, (void*) sbuf->base, (size_t) b.size, b_vmem); return bi; } bool same_shape(const htp_tensor * h, const ggml_tensor * t) const { - return (h->ne[0] == t->ne[0]) && (h->ne[1] == t->ne[1]) && (h->ne[2] == t->ne[2]) && (h->ne[3] == t->ne[3]) && - (h->nb[0] == t->nb[0]) && (h->nb[1] == t->nb[1]) && (h->nb[2] == t->nb[2]) && (h->nb[3] == t->nb[3]); + int64_t ne0 = t->ne[0]; + int64_t ne1 = t->ne[1]; + const bool is_repack = ggml_backend_buffer_is_hexagon_repack(t->buffer) && ggml_hexagon_is_repack_type(t->type); + if (is_repack) { + ne0 = hex_round_up(ne0, 32); + ne1 = hex_round_up(ne1, 32); + } + int64_t nb1 = is_repack ? ggml_row_size(t->type, ne0) : t->nb[1]; + int64_t nb2 = is_repack ? nb1 * ne1 : t->nb[2]; + int64_t nb3 = is_repack ? nb2 * t->ne[2] : t->nb[3]; + + return (h->ne[0] == ne0) && (h->ne[1] == ne1) && (h->ne[2] == t->ne[2]) && (h->ne[3] == t->ne[3]) && + (h->nb[0] == t->nb[0]) && (h->nb[1] == nb1) && (h->nb[2] == nb2) && (h->nb[3] == nb3); } // add tensor and return its index @@ -1901,19 +1197,35 @@ struct ggml_hexagon_opbatch { htp_tensor &h = h_tens[ti]; h.bi = add_buffer(sbuf); h.data = t_offset; - h.size = t_size; h.type = t->type; - h.ne[0] = t->ne[0]; h.ne[1] = t->ne[1]; h.ne[2] = t->ne[2]; h.ne[3] = t->ne[3]; - h.nb[0] = t->nb[0]; h.nb[1] = t->nb[1]; h.nb[2] = t->nb[2]; h.nb[3] = t->nb[3]; + + const bool is_repack = ggml_backend_buffer_is_hexagon_repack(t->buffer) && ggml_hexagon_is_repack_type(t->type); + if (is_repack) { + h.ne[0] = hex_round_up(t->ne[0], 32); + h.ne[1] = hex_round_up(t->ne[1], 32); + h.ne[2] = t->ne[2]; + h.ne[3] = t->ne[3]; + + h.nb[0] = t->nb[0]; + h.nb[1] = ggml_row_size(t->type, h.ne[0]); + h.nb[2] = h.nb[1] * h.ne[1]; + h.nb[3] = h.nb[2] * h.ne[2]; + h.size = h.nb[3] * h.ne[3]; + t_size = h.size; + } else { + h.size = t_size; + h.ne[0] = t->ne[0]; h.ne[1] = t->ne[1]; h.ne[2] = t->ne[2]; h.ne[3] = t->ne[3]; + h.nb[0] = t->nb[0]; h.nb[1] = t->nb[1]; h.nb[2] = t->nb[2]; h.nb[3] = t->nb[3]; + } h.flags = 0; if (ggml_backend_buffer_get_usage(t->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) { h.flags |= HTP_TENSOR_COMPUTE; } - HEX_VERBOSE("ggml-hex: add-tensor #%u %s : bi %d data %p offset %zu size %zu flags 0x%x : %zu:%zu:%zu:%zu\n", + HEX_VERBOSE("ggml-hex: %s add-tensor #%u %s : bi %d data %p offset %zu size %zu flags 0x%x : %zu:%zu:%zu:%zu\n", sess->c_name(), ti, t->name, h.bi, (void*) t->data, (size_t) t_offset, t_size, h.flags, - (size_t) t->ne[0], (size_t) t->ne[1], (size_t) t->ne[2], (size_t) t->ne[3]); + (size_t) h.ne[0], (size_t) h.ne[1], (size_t) h.ne[2], (size_t) h.ne[3]); return ti; } @@ -1942,7 +1254,9 @@ struct ggml_hexagon_opbatch { for (const auto * src : node.get_inputs()) { fit_tensor(src); } - fit_tensor(node.dst()); + for (const auto * output : node.get_outputs()) { + fit_tensor(output); + } if ((extra_bufs + n_bufs) > n_bufs_max) return false; if ((extra_tens + n_tens) > n_tens_max) return false; @@ -1961,7 +1275,8 @@ struct ggml_hexagon_opbatch { ops[n] = node; htp_op_desc &o = h_ops[n]; - memcpy(&o.params, &node.node->op_params, sizeof(node.node->op_params)); + memcpy(o.params, node.node->op_params, sizeof(node.node->op_params)); + memcpy(o.kernel_params, node.kernel_params, sizeof(o.kernel_params)); o.opcode = node.opcode; o.flags = 0; @@ -1969,13 +1284,17 @@ struct ggml_hexagon_opbatch { o.flags |= HTP_OPFLAGS_SKIP_COMPUTE; } - ggml_hexagon_dump_op_exec(sess->c_name(), node, o.flags); + ggml_hexagon_dump_op_exec(sess->c_name(), ops[n], o.flags); auto inputs = node.get_inputs(); for (unsigned int i=0; i < HTP_OP_MAX_INPUTS; i++) { - o.src[i] = (i < inputs.size() && inputs[i]) ? add_tensor(inputs[i]) : 0xffff; + o.src[i] = (i < inputs.size() && inputs[i]) ? add_tensor(inputs[i]) : 0xffff; + } + + auto outputs = node.get_outputs(); + for (unsigned int i=0; i < HTP_OP_MAX_OUTPUTS; i++) { + o.dst[i] = (i < outputs.size() && outputs[i]) ? add_tensor(outputs[i]) : 0xffff; } - o.dst = add_tensor(node.dst()); } }; @@ -1986,19 +1305,25 @@ struct ggml_hexagon_opqueue { using opvec = std::vector; - std::queue done; // completed batch ids - std::vector op_cache; // per batch op cache - std::vector start_usec; // per batch start time + std::queue done; // completed batch ids + std::vector op_cache; // per batch op cache + std::vector start_usec; // per batch start time ggml_hexagon_opqueue(ggml_hexagon_session *sess, size_t batch_size, size_t depth) { size_t n_bufs = HTP_OP_MAX_BUFS; size_t n_ops = batch_size; - size_t n_tensors = n_ops + n_ops * HTP_OP_MAX_INPUTS; + size_t n_tensors = n_ops * HTP_OP_MAX_OUTPUTS + n_ops * HTP_OP_MAX_INPUTS; + + size_t tr_size = 0; + if (opt_profile == 3) { + tr_size = (HTP_MAX_NTHREADS + 1) * opt_optrace * sizeof(htp_trace_desc); + } shm_blk_size = sizeof(htp_buf_desc) * n_bufs + sizeof(htp_tensor) * n_tensors + sizeof(htp_op_desc) * n_ops + - sizeof(htp_prof_desc) * n_ops; + sizeof(htp_prof_desc) * n_ops + + tr_size; shm_buf = new ggml_hexagon_shared_buffer(sess, shm_blk_size * depth, true /* pinned */); @@ -2042,11 +1367,19 @@ struct ggml_hexagon_opqueue { const size_t o_size = sizeof(htp_op_desc) * req.n_ops; const size_t p_size = sizeof(htp_prof_desc) * req.n_ops; + size_t tr_size = 0; + if (opt_profile == 3) { + req.n_traces = opt_optrace; + tr_size = (HTP_MAX_NTHREADS + 1) * req.n_traces * sizeof(htp_trace_desc); + } else { + req.n_traces = 0; + } + dbuf.ptr = shm_buf->base + (req.id * shm_blk_size); dbuf.fd = shm_buf->fd; dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT; dbuf.offset = (uint8_t*) dbuf.ptr - (uint8_t*) shm_buf->base; - dbuf.size = b_size + t_size + o_size + p_size; + dbuf.size = b_size + t_size + o_size + p_size + tr_size; GGML_ASSERT(dbuf.size <= shm_blk_size); @@ -2092,7 +1425,14 @@ struct ggml_hexagon_opqueue { const size_t o_size = sizeof(htp_op_desc) * rsp.n_ops; const size_t p_size = sizeof(htp_prof_desc) * rsp.n_ops; - const size_t m_size = b_size + t_size + o_size + p_size; + size_t tr_size = 0; + uint32_t n_traces = 0; + if (opt_profile == 3) { + n_traces = opt_optrace; + tr_size = (HTP_MAX_NTHREADS + 1) * n_traces * sizeof(htp_trace_desc); + } + + const size_t m_size = b_size + t_size + o_size + p_size + tr_size; GGML_ASSERT(m_size <= shm_blk_size); HEX_VERBOSE("ggml-hex: %s op-queue pop batch #%u : n-bufs %u n-tensors %u n-ops %u : m-size %zu b-size %zu t-size %zu o-size %zu\n", @@ -2111,13 +1451,62 @@ struct ggml_hexagon_opqueue { GGML_ASSERT(rsp.n_ops <= ops.size()); const htp_prof_desc * pd = (const htp_prof_desc *) p_ptr; + + const htp_trace_desc * trace_events = nullptr; + + if (opt_profile == 3) { + trace_events = (const htp_trace_desc *) (p_ptr + p_size); + } + + uint32_t trace_idx[HTP_MAX_NTHREADS + 1] = {0}; + uint32_t valid_cnt[HTP_MAX_NTHREADS + 1] = {0}; + + if (opt_profile == 3) { + for (uint32_t t = 0; t <= HTP_MAX_NTHREADS; t++) { + uint32_t count = rsp.n_traces[t]; + valid_cnt[t] = count > n_traces ? n_traces : count; + } + } + for (uint32_t i = 0; i < rsp.n_ops; i++) { htp_usec += pd[i].usecs; - ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i].usecs, pd[i].cycles, pd[i].pmu); + + ggml_hexagon_dump_op_prof(shm_buf->sess->name, ops[i], pd[i]); + + if (opt_profile == 3) { + uint32_t op_duration = pd[i].cycles_stop - pd[i].cycles_start; + + for (uint32_t t = 0; t <= HTP_MAX_NTHREADS; t++) { + while (trace_idx[t] < valid_cnt[t]) { + const auto & e = trace_events[t * n_traces + trace_idx[t]]; + uint32_t offset = e.cycles - pd[i].cycles_start; + if (offset >= 0x80000000) { + trace_idx[t]++; + continue; + } + if (offset > op_duration) { + break; + } + bool is_stop = (e.info & 0x8000) != 0; + uint16_t info = e.info & 0x7FFF; + GGML_LOG_DEBUG("ggml-hex: %s trace-op %s: thread %u event %s info %u %s %u\n", + shm_buf->sess->c_name(), ops[i].op_name().c_str(), t, htp_event_name(e.id), info, is_stop ? "stop" : "start", e.cycles); + trace_idx[t]++; + } + } + } + } + + char evt_str[256] = ""; + if (opt_profile == 3) { + snprintf(evt_str, sizeof(evt_str), " evt [%u,%u,%u,%u,%u,%u,%u,%u,%u,%u,%u]", + rsp.n_traces[0], rsp.n_traces[1], rsp.n_traces[2], rsp.n_traces[3], + rsp.n_traces[4], rsp.n_traces[5], rsp.n_traces[6], rsp.n_traces[7], + rsp.n_traces[8], rsp.n_traces[9], rsp.n_traces[10]); } - GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u\n", - shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec); + GGML_LOG_DEBUG("ggml-hex: %s profile-batch n-ops %u batch-dur-usec %lld htp-ops-usec %u%s\n", + shm_buf->sess->c_name(), rsp.n_ops, (long long) batch_usec, htp_usec, evt_str); } } }; @@ -2134,6 +1523,7 @@ void ggml_hexagon_session::flush_pending(bool all) { // Read response packet from queue const uint32_t timeo = opt_oppoll ? 0 : DSPQUEUE_TIMEOUT; + int err = dspqueue_read(this->queue, &flags, 1, &n_dbufs, &dbuf, sizeof(rsp), &rsp_size, (uint8_t *) &rsp, timeo); if (err == AEE_EEXPIRED) { continue; @@ -2314,6 +1704,31 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { this->valid_handle = true; + // Query HW info and resolve session options + this->max_bufsize = opt_mbuf; + { + unsigned int hw_n_threads = 0; + unsigned int hw_n_hvx = 0; + unsigned int hw_n_hmx = 0; + unsigned long long hw_vtcm_size = 0; + int hw_err = htp_iface_hwinfo(this->handle, &hw_n_threads, &hw_n_hvx, &hw_n_hmx, &hw_vtcm_size); + if (hw_err == 0) { + this->n_threads = opt_nhvx > 0 ? (uint32_t)opt_nhvx : (uint32_t)hw_n_threads; + this->n_hvx = opt_nhvx > 0 ? (uint32_t)opt_nhvx : (uint32_t)hw_n_hvx; + this->n_hmx = (opt_nhmx != 0) ? (uint32_t)hw_n_hmx : 0; + this->vtcm_size = (uint64_t)hw_vtcm_size; + GGML_LOG_INFO("ggml-hex: %s hwinfo: threads %u, hvx %u, hmx %u, vtcm %llu MB\n", + this->c_name(), this->n_threads, this->n_hvx, this->n_hmx, + (unsigned long long)(this->vtcm_size / (1024 * 1024))); + } else { + GGML_LOG_WARN("ggml-hex: %s failed to query hwinfo (0x%x), using defaults\n", this->c_name(), hw_err); + this->n_threads = opt_nhvx > 0 ? (uint32_t)opt_nhvx : 8; + this->n_hvx = opt_nhvx > 0 ? (uint32_t)opt_nhvx : 8; + this->n_hmx = (opt_nhmx != 0) ? 1 : 0; + this->vtcm_size = 8 * 1024 * 1024; + } + } + // Enable FastRPC QoS mode { struct remote_rpc_control_latency l; @@ -2378,11 +1793,12 @@ void ggml_hexagon_session::allocate(int dev_id) noexcept(false) { opt_vmem = ggml_hexagon_measure_max_vmem(this); GGML_LOG_INFO("ggml-hex: %s measured max vmem %zu\n", this->c_name(), opt_vmem); } + this->max_vmem = opt_vmem; - this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch, opt_vmem); + this->op_batch = new ggml_hexagon_opbatch(this, opt_opbatch, this->max_vmem); // Start dspqueue/opbatch processing - err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_use_hmx, opt_vmem); + err = htp_iface_start(this->handle, dev_id, this->queue_id, opt_nhvx, opt_nhmx, this->max_vmem); if (err != 0) { GGML_LOG_ERROR("ggml-hex: %s failed to start session: 0x%08x\n", this->c_name(), (unsigned) err); throw std::runtime_error("ggml-hex: iface start failed (see log for details)"); @@ -2463,16 +1879,6 @@ ggml_hexagon_session::~ggml_hexagon_session() noexcept(true) { // ** backend interface -static bool ggml_backend_buffer_is_hexagon(const struct ggml_backend_buffer * b) { - return b->buft->iface.get_alignment == ggml_backend_hexagon_buffer_type_get_alignment; -} - -static inline bool ggml_backend_buffer_is_hexagon_repack(const struct ggml_backend_buffer * b) { - if (!opt_hostbuf) { - return ggml_backend_buffer_is_hexagon(b); - } - return b->buft->iface.alloc_buffer == ggml_backend_hexagon_repack_buffer_type_alloc_buffer; -} static bool ggml_hexagon_supported_flash_attn_ext(const struct ggml_hexagon_session * sess, const struct ggml_tensor * op) { const struct ggml_tensor * src0 = op->src[0]; @@ -2563,6 +1969,640 @@ static bool ggml_hexagon_supported_gated_delta_net(const struct ggml_hexagon_ses return true; } +static bool ggml_hexagon_matmul_is_hmx_eligible( + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * dst, + int ne01_padded, + bool is_matmul_id, + bool is_batched +) { + const int ne00 = src0->ne[0]; + const int ne11 = src1->ne[1]; + const int ne12 = src1->ne[2]; + const int wtype = src0->type; + + // HMX weight tile requires N to be 32-aligned. + if (ne01_padded % 32 != 0) { + return false; + } + + // HMX supports F16, F32, and repack quantized types. + if (!ggml_hexagon_is_hmx_weight_type((ggml_type) wtype)) { + return false; + } + + // HMX paths require K aligned to 32. + if (ne00 % 32 != 0) { + return false; + } + + // Quantized HMX kernels only handle flat 2D matmul (or matmul_id wrapping flat 2D matmuls). + if (!is_matmul_id && is_batched && wtype != GGML_TYPE_F16) { + return false; + } + + // HMX assumes contiguous row-major layout. + if (src0->nb[0] > src0->nb[1] || src1->nb[0] > src1->nb[1]) { + return false; + } + + // M alignment: Use HMX when M > HTP_MM_HMX_MIN_NROWS + const int m = is_matmul_id ? ne12 : ne11; + if (m <= HTP_MM_HMX_MIN_NROWS) { + return false; + } + + return true; +} + +static bool ggml_hexagon_precompute_hmx_mm_params( + const struct ggml_hexagon_session * sess, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * dst, + int wtype, + int ne00_padded, + int ne01_padded, + int ne02, + int ne11, + int ne12, + int ne11_padded, + bool is_matmul_id, + bool is_batched, + size_t vtcm_budget, + struct htp_mm_kernel_params * kparams +) { + const int aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype); + const bool pipeline = is_matmul_id ? false : htp_mm_hmx_pipeline(ne11); + const int n_threads = (int)sess->n_threads; + const int ne10 = src1->ne[0]; + + const bool is_batched_val = is_matmul_id ? false : is_batched; + const int group_size = (ne02 > 0 ? ne12 / ne02 : 1); + + size_t m_chunk = 0; + size_t n_chunk = 0; + size_t vtcm_size = 0; + bool use_grouped = false; + int act_threads_selected = 0; + + if (is_batched_val && wtype == GGML_TYPE_F16 && group_size > 1) { + // Try grouped path first + const bool use_dma_activation = (src1->nb[1]/sizeof(float) > (size_t)ne00_padded); + size_t best_mblocks = SIZE_MAX; + int best_act_threads = 0; + size_t best_m_chunk = 0; + size_t best_n_chunk = 0; + size_t best_vtcm_size = 0; + + int act_threads = n_threads; + while (act_threads >= 1) { + const size_t f32_scratch_size = use_dma_activation ? hex_align_up(act_threads * HTP_MM_DMA_ACT_MULTIPLIER * ne00_padded * sizeof(float), HTP_MM_HMX_TILE_SIZE) : 0; + size_t group_overhead = 256 + f32_scratch_size; + size_t group_size_per_n, group_size_per_m, group_size_per_mn; + htp_mm_hmx_get_batched_chunk_costs(ne00_padded, group_size, &group_size_per_n, &group_size_per_m, &group_size_per_mn); + + size_t m_chunk_candidate = 0; + size_t n_chunk_candidate = 0; + size_t vtcm_size_candidate = 0; + + if (htp_mm_hmx_compute_chunks(vtcm_budget, group_overhead, group_size_per_n, group_size_per_m, group_size_per_mn, hex_align_up(ne11, 32), ne01_padded, + (size_t) ne01_padded * HTP_MM_HMX_COST_W_DEQUANT, (size_t) ne11 * HTP_MM_HMX_COST_A_CONVERT, + &m_chunk_candidate, &n_chunk_candidate, &vtcm_size_candidate) == 0) { + size_t exact_size = htp_mm_hmx_get_batched_vtcm_size(wtype, ne00_padded, m_chunk_candidate, n_chunk_candidate, group_size, use_dma_activation, pipeline, act_threads); + if (exact_size <= vtcm_budget) { + size_t mblocks = ((size_t) ne11 + m_chunk_candidate - 1) / m_chunk_candidate; + if (mblocks < best_mblocks || (mblocks == best_mblocks && act_threads > best_act_threads)) { + best_mblocks = mblocks; + best_act_threads = act_threads; + best_m_chunk = m_chunk_candidate; + best_n_chunk = n_chunk_candidate; + best_vtcm_size = exact_size; + } + } + } + if (act_threads == 1) { + act_threads = 0; + } else { + act_threads /= 2; + } + } + + if (best_act_threads > 0) { + m_chunk = best_m_chunk; + n_chunk = best_n_chunk; + vtcm_size = best_vtcm_size; + act_threads_selected = best_act_threads; + use_grouped = true; + } + } + + if (!use_grouped) { + // Fallback to simple 2D path (group_size = 1) + size_t best_mblocks = SIZE_MAX; + int best_act_threads = 0; + size_t best_m_chunk = 0; + size_t best_n_chunk = 0; + size_t best_vtcm_size = 0; + + // For MUL_MAT_ID the kernel runs one 2D matmul per expert, with M equal to the number of rows routed to that expert. + // A single expert can receive up to all routed rows (dst->ne[1]*dst->ne[2] = n_expert_used*n_tokens), so size the chunk + // search for that upper bound rather than ne12 (token positions only). + // We recompute m_chunk per expert against the actual count in the NPU kernel. + const int m_id_rows = (int) ((size_t) dst->ne[1] * dst->ne[2]); + const int m_for_chunks = is_matmul_id ? hex_align_up(m_id_rows, 32) : ne11_padded; + const int m_for_cost = is_matmul_id ? m_id_rows : ne11; + + int act_threads = n_threads; + while (act_threads >= 1) { + const size_t act_f32_size = is_matmul_id ? 0 : hex_align_up(act_threads * HTP_MM_DMA_ACT_MULTIPLIER * ne00_padded * sizeof(float), HTP_MM_HMX_TILE_SIZE); + size_t simple_2d_overhead = 256 + act_f32_size; + size_t simple_2d_size_per_n, simple_2d_size_per_m, simple_2d_size_per_mn; + htp_mm_hmx_get_2d_chunk_costs(wtype, ne00_padded, pipeline, aligned_tile_size, &simple_2d_size_per_n, &simple_2d_size_per_m, &simple_2d_size_per_mn); + + size_t m_chunk_candidate = 0; + size_t n_chunk_candidate = 0; + size_t vtcm_size_candidate = 0; + + if (htp_mm_hmx_compute_chunks(vtcm_budget, simple_2d_overhead, simple_2d_size_per_n, simple_2d_size_per_m, simple_2d_size_per_mn, m_for_chunks, ne01_padded, + (size_t) ne01_padded * HTP_MM_HMX_COST_W_DEQUANT, (size_t) m_for_cost * HTP_MM_HMX_COST_A_CONVERT, + &m_chunk_candidate, &n_chunk_candidate, &vtcm_size_candidate) == 0) { + size_t exact_size = htp_mm_hmx_get_2d_vtcm_size(wtype, ne00_padded, m_chunk_candidate, n_chunk_candidate, pipeline, is_matmul_id ? 0 : act_threads, aligned_tile_size); + if (exact_size <= vtcm_budget) { + size_t mblocks = ((size_t) m_for_cost + m_chunk_candidate - 1) / m_chunk_candidate; + if (mblocks < best_mblocks || (mblocks == best_mblocks && act_threads > best_act_threads)) { + best_mblocks = mblocks; + best_act_threads = act_threads; + best_m_chunk = m_chunk_candidate; + best_n_chunk = n_chunk_candidate; + best_vtcm_size = exact_size; + } + } + } + if (act_threads == 1) { + act_threads = 0; + } else { + act_threads /= 2; + } + } + + if (best_act_threads > 0) { + m_chunk = best_m_chunk; + n_chunk = best_n_chunk; + vtcm_size = best_vtcm_size; + act_threads_selected = best_act_threads; + } else { + return false; + } + } + + kparams->n_hmx = 1; + kparams->pipeline = pipeline ? 1 : 0; + kparams->m_chunk = m_chunk; + kparams->n_chunk = n_chunk; + kparams->n_threads = n_threads; + kparams->n_act_threads = act_threads_selected; + kparams->tile_size = htp_mm_get_weight_tile_size(wtype); + kparams->aligned_tile_size = aligned_tile_size; + kparams->src1_row_size = (wtype == GGML_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10) : htp_mm_q8_0_tiled_row_size(ne10); + kparams->vtcm_size = vtcm_size; + kparams->vtcm_src0_size = 0; + kparams->vtcm_src1_size = 0; + kparams->vtcm_dst_size = 0; + + if (is_batched && !is_matmul_id) { + kparams->kernel_type = HTP_MM_KERNEL_HMX_F16_BATCHED; + } else { + kparams->kernel_type = HTP_MM_KERNEL_HMX_2D; + } + return true; +} + +static void ggml_hexagon_precompute_hvx_mm_params( + const struct ggml_hexagon_session * sess, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * dst, + int wtype, + int ne02, + int ne03, + int ne10, + int ne11, + int ne12, + int ne13, + bool is_matmul_id, + size_t vtcm_budget, + struct htp_mm_kernel_params * kparams +) { + kparams->n_hmx = 0; + + const bool is_quant = (wtype != GGML_TYPE_F16 && wtype != GGML_TYPE_F32); + const int src1_nrows = ne11 * ne12 * ne13; + + if (is_quant) { + // Quantized HVX + kparams->tile_size = htp_mm_get_weight_tile_size(wtype); + kparams->aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype); + + const bool k_align = (ne10 % 32 == 0); + + if (is_matmul_id) { + kparams->kernel_type = (src1_nrows < (int) sess->n_threads) ? HTP_MM_KERNEL_HVX_QUANT_BLOCK : HTP_MM_KERNEL_HVX_QUANT_ROW; + kparams->src1_row_size = (wtype == GGML_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10) : htp_mm_q8_0_tiled_row_size(ne10); + + size_t vtcm_src0_size = 0, vtcm_src1_size = 0; + uint32_t max_prefetch = (src1_nrows > HTP_MM_HMX_MIN_NROWS) ? 2 : 16; + uint32_t best_n_prefetch = 2; + size_t total_size = 0; + for (uint32_t d = max_prefetch; d >= 2; d /= 2) { + total_size = htp_mm_hvx_id_get_vtcm_sizes( + wtype, ne10, src1_nrows, sess->n_threads, src0->nb[1], d, + &vtcm_src0_size, &vtcm_src1_size + ); + if (total_size <= vtcm_budget) { + best_n_prefetch = d; + break; + } + } + if (best_n_prefetch == 2 && total_size > vtcm_budget) { + total_size = htp_mm_hvx_id_get_vtcm_sizes( + wtype, ne10, src1_nrows, sess->n_threads, src0->nb[1], 2, + &vtcm_src0_size, &vtcm_src1_size + ); + } + kparams->n_prefetch = best_n_prefetch; + kparams->vtcm_size = total_size; + kparams->vtcm_src0_size = vtcm_src0_size; + kparams->vtcm_src1_size = vtcm_src1_size; + kparams->vtcm_dst_size = 0; + } else { + bool try_tiled = (k_align && opt_mm_select >= 2); + if (try_tiled) { + kparams->src1_row_size = (wtype == GGML_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10) : htp_mm_q8_0_tiled_row_size(ne10); + if (src1_nrows < (int)sess->n_threads) { + kparams->kernel_type = HTP_MM_KERNEL_HVX_QUANT_BLOCK; + } else { + kparams->kernel_type = HTP_MM_KERNEL_HVX_QUANT_ROW; + } + + uint32_t max_prefetch = (src1_nrows > HTP_MM_HMX_MIN_NROWS) ? 2 : 16; + uint32_t best_n_prefetch = 2; + size_t vtcm_src0_size = 0, vtcm_src1_size = 0, vtcm_dst_size = 0; + size_t total_size = 0; + for (uint32_t d = max_prefetch; d >= 2; d /= 2) { + total_size = htp_mm_hvx_get_vtcm_sizes( + kparams->kernel_type, wtype, ne10, src1_nrows, sess->n_threads, + dst->nb[1], src0->nb[1], src1->nb[1], d, &vtcm_src0_size, &vtcm_src1_size, &vtcm_dst_size + ); + if (total_size <= vtcm_budget) { + best_n_prefetch = d; + break; + } + } + if (best_n_prefetch == 2 && total_size > vtcm_budget) { + total_size = htp_mm_hvx_get_vtcm_sizes( + kparams->kernel_type, wtype, ne10, src1_nrows, sess->n_threads, + dst->nb[1], src0->nb[1], src1->nb[1], 2, &vtcm_src0_size, &vtcm_src1_size, &vtcm_dst_size + ); + } + + kparams->n_prefetch = best_n_prefetch; + + if (total_size <= vtcm_budget) { + kparams->vtcm_size = total_size; + kparams->vtcm_src0_size = vtcm_src0_size; + kparams->vtcm_src1_size = vtcm_src1_size; + kparams->vtcm_dst_size = vtcm_dst_size; + goto done_quant; + } + HEX_VERBOSE("ggml-hex: %s HVX tiled path VTCM size needed (%zu) > budget (%zu), falling back to HVX flat\n", sess->name.c_str(), total_size, vtcm_budget); + } + + // Flat HVX fallback + { + kparams->src1_row_size = (wtype == GGML_TYPE_Q4_1) ? htp_mm_q8_1_flat_row_size(ne10) : htp_mm_q8_0_flat_row_size(ne10); + kparams->kernel_type = HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT; + + size_t vtcm_src0_size = 0, vtcm_src1_size = 0, vtcm_dst_size = 0; + size_t total_size = htp_mm_hvx_get_vtcm_sizes( + kparams->kernel_type, wtype, ne10, src1_nrows, sess->n_threads, + dst->nb[1], src0->nb[1], src1->nb[1], 16, &vtcm_src0_size, &vtcm_src1_size, &vtcm_dst_size + ); + + kparams->n_prefetch = 16; + kparams->vtcm_size = total_size; + kparams->vtcm_src0_size = vtcm_src0_size; + kparams->vtcm_src1_size = vtcm_src1_size; + kparams->vtcm_dst_size = vtcm_dst_size; + } + } + + done_quant:; + } else if (wtype == GGML_TYPE_F16) { + // F16 HVX + const bool is_batched = (ne02 > 1) || (ne03 > 1); + const bool is_permuted = ggml_is_permuted(src0) || ggml_is_permuted(src1); + + size_t vtcm_src0_size = 0, vtcm_src1_size = 0, vtcm_dst_size = 0; + size_t vtcm_size = htp_mm_hvx_get_vtcm_sizes( + HTP_MM_KERNEL_HVX_F16_F16_VTCM, wtype, ne10, src1_nrows, sess->n_threads, + dst->nb[1], src0->nb[1], src1->nb[1], 16, &vtcm_src0_size, &vtcm_src1_size, &vtcm_dst_size + ); + + if (!is_batched && !is_permuted && vtcm_size <= vtcm_budget) { + kparams->kernel_type = HTP_MM_KERNEL_HVX_F16_F16_VTCM; + kparams->src1_row_size = hex_round_up(ne10 * 2, 128); + kparams->vtcm_size = vtcm_size; + kparams->vtcm_src0_size = vtcm_src0_size; + kparams->vtcm_src1_size = vtcm_src1_size; + kparams->vtcm_dst_size = vtcm_dst_size; + kparams->n_prefetch = 16; + } else { + if (src1->type == GGML_TYPE_F32) { + kparams->kernel_type = HTP_MM_KERNEL_HVX_F16_F32_DDR; + } else { + kparams->kernel_type = HTP_MM_KERNEL_HVX_F16_F16_DDR; + } + kparams->src1_row_size = src1->nb[1]; + size_t ddr_size = htp_mm_hvx_get_vtcm_sizes( + kparams->kernel_type, wtype, ne10, src1_nrows, sess->n_threads, + dst->nb[1], src0->nb[1], src1->nb[1], 16, &vtcm_src0_size, &vtcm_src1_size, &vtcm_dst_size + ); + kparams->vtcm_size = ddr_size; + kparams->vtcm_src0_size = vtcm_src0_size; + kparams->vtcm_src1_size = vtcm_src1_size; + kparams->vtcm_dst_size = vtcm_dst_size; + kparams->n_prefetch = 16; + } + } else { + // F32 HVX + const bool is_batched = (ne02 > 1) || (ne03 > 1); + const bool is_permuted = ggml_is_permuted(src0) || ggml_is_permuted(src1); + + size_t vtcm_src0_size = 0, vtcm_src1_size = 0, vtcm_dst_size = 0; + size_t vtcm_size = htp_mm_hvx_get_vtcm_sizes( + HTP_MM_KERNEL_HVX_F32_F32_VTCM, wtype, ne10, src1_nrows, sess->n_threads, + dst->nb[1], src0->nb[1], src1->nb[1], 16, &vtcm_src0_size, &vtcm_src1_size, &vtcm_dst_size + ); + + if (!is_batched && !is_permuted && vtcm_size <= vtcm_budget) { + kparams->kernel_type = HTP_MM_KERNEL_HVX_F32_F32_VTCM; + kparams->src1_row_size = hex_round_up(ne10 * 4, 128); + kparams->vtcm_size = vtcm_size; + kparams->vtcm_src0_size = vtcm_src0_size; + kparams->vtcm_src1_size = vtcm_src1_size; + kparams->vtcm_dst_size = vtcm_dst_size; + kparams->n_prefetch = 16; + } else { + kparams->kernel_type = HTP_MM_KERNEL_HVX_F32_F32_DDR; + kparams->src1_row_size = src1->nb[1]; + size_t ddr_size = htp_mm_hvx_get_vtcm_sizes( + kparams->kernel_type, wtype, ne10, src1_nrows, sess->n_threads, + dst->nb[1], src0->nb[1], src1->nb[1], 16, &vtcm_src0_size, &vtcm_src1_size, &vtcm_dst_size + ); + kparams->vtcm_size = ddr_size; + kparams->vtcm_src0_size = vtcm_src0_size; + kparams->vtcm_src1_size = vtcm_src1_size; + kparams->vtcm_dst_size = vtcm_dst_size; + kparams->n_prefetch = 16; + } + } +} + +static void ggml_hexagon_precompute_matmul_params( + const struct ggml_hexagon_session * sess, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * dst, + struct htp_mm_kernel_params * kparams +) { + memset(kparams, 0, sizeof(*kparams)); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; + const int ne03 = src0->ne[3]; + + const int ne10 = src1->ne[0]; + const int ne11 = src1->ne[1]; + const int ne12 = src1->ne[2]; + const int ne13 = src1->ne[3]; + + const int wtype = src0->type; + const bool is_repack = ggml_hexagon_is_repack_type((ggml_type) wtype); + const int ne00_padded = is_repack ? hex_round_up(ne00, 32) : ne00; + const int ne01_padded = is_repack ? hex_round_up(ne01, 32) : ne01; + const int ne11_padded = hex_round_up(ne11, 32); + + const bool is_matmul_id = (dst->op == GGML_OP_MUL_MAT_ID); + const bool is_batched = (ne02 * ne03 > 1 || ne12 * ne13 > 1); + + const size_t vtcm_budget = sess->vtcm_size; + + // Check HMX eligibility and try precomputing HMX parameters + bool hmx_enabled = (sess->n_hmx > 0) && (opt_mm_select >= 3); + if (hmx_enabled && ggml_hexagon_matmul_is_hmx_eligible(src0, src1, dst, ne01_padded, is_matmul_id, is_batched)) { + if (ggml_hexagon_precompute_hmx_mm_params(sess, src0, src1, dst, wtype, ne00_padded, ne01_padded, ne02, ne11, ne12, ne11_padded, is_matmul_id, is_batched, vtcm_budget, kparams)) { + goto finalize; + } + } + + // Fallback to HVX parameter computation + ggml_hexagon_precompute_hvx_mm_params(sess, src0, src1, dst, wtype, ne02, ne03, ne10, ne11, ne12, ne13, is_matmul_id, vtcm_budget, kparams); + +finalize: + kparams->div_ne12_ne1 = init_fastdiv_values(ne12 * ne11); + kparams->div_ne1 = init_fastdiv_values(ne11); + kparams->div_r2 = init_fastdiv_values(ne02 > 0 ? ne12 / ne02 : 1); + kparams->div_r3 = init_fastdiv_values(ne03 > 0 ? ne13 / ne03 : 1); + kparams->div_ne11 = init_fastdiv_values(ne11); +} + +static void ggml_hexagon_precompute_fused_qkv_params( + const struct ggml_hexagon_session * sess, + const struct ggml_tensor * src0, // Wk + const struct ggml_tensor * src1, // x + struct htp_mm_kernel_params * kparams +) { + memset(kparams, 0, sizeof(*kparams)); + + const int wtype = src0->type; + const bool is_repack = ggml_hexagon_is_repack_type((ggml_type) wtype); + + const int ne10 = src1->ne[0]; + const int src1_nrows = src1->ne[1] * src1->ne[2] * src1->ne[3]; + const size_t src1_row_size = (wtype == GGML_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10) : htp_mm_q8_0_tiled_row_size(ne10); + const size_t src0_row_size = src0->nb[1]; + const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); + + size_t src0_sz_per_thread = 0; + size_t src2_sz_per_thread = 0; + size_t src3_sz_per_thread = 0; + uint32_t best_n_prefetch = 16; + + if (is_repack) { + uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype); + uint32_t n_k_tiles = hex_round_up(ne10, 32) / 32; + uint32_t tile_row_size = n_k_tiles * aligned_tile_size; + size_t src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0_TILED * sizeof(float)); + size_t src1_sz_per_thread = hex_round_up(src1_row_size * src1_nrows, 128); + size_t src1_sz = src1_sz_per_thread; + + const uint32_t max_prefetch = (src1_nrows > HTP_MM_HMX_MIN_NROWS) ? 2 : 16; + best_n_prefetch = 2; + for (uint32_t d = max_prefetch; d >= 2; d /= 2) { + size_t repacked_vtcm_size = hex_round_up(d * tile_row_size, 128); + if (repacked_vtcm_size < src1_row_size_padded) { + repacked_vtcm_size = src1_row_size_padded; + } + size_t src0_sz = repacked_vtcm_size * sess->n_threads; + size_t src2_sz = hex_round_up(d * tile_row_size, 128) * sess->n_threads; + size_t src3_sz = hex_round_up(d * tile_row_size, 128) * sess->n_threads; + size_t tiled_vtcm_size = src0_sz + src1_sz + src2_sz + src3_sz; + + if (tiled_vtcm_size <= sess->vtcm_size) { + best_n_prefetch = d; + src0_sz_per_thread = repacked_vtcm_size; + src2_sz_per_thread = hex_round_up(d * tile_row_size, 128); + src3_sz_per_thread = hex_round_up(d * tile_row_size, 128); + break; + } + } + if (best_n_prefetch == 2 && src0_sz_per_thread == 0) { + size_t repacked_vtcm_size = hex_round_up(2 * tile_row_size, 128); + if (repacked_vtcm_size < src1_row_size_padded) { + repacked_vtcm_size = src1_row_size_padded; + } + src0_sz_per_thread = repacked_vtcm_size; + src2_sz_per_thread = hex_round_up(2 * tile_row_size, 128); + src3_sz_per_thread = hex_round_up(2 * tile_row_size, 128); + } + } else { + best_n_prefetch = 16; + src0_sz_per_thread = hex_round_up(best_n_prefetch * src0_row_size_padded, 128); + src2_sz_per_thread = hex_round_up(best_n_prefetch * src0_row_size_padded, 128); + src3_sz_per_thread = hex_round_up(best_n_prefetch * src0_row_size_padded, 128); + } + + size_t src1_sz_per_thread = hex_round_up(src1_row_size * src1_nrows, 128); + + size_t src0_sz = src0_sz_per_thread * sess->n_threads; + size_t src1_sz = src1_sz_per_thread; + size_t src2_sz = src2_sz_per_thread * sess->n_threads; + size_t src3_sz = src3_sz_per_thread * sess->n_threads; + + size_t tiled_vtcm_size = src0_sz + src1_sz + src2_sz + src3_sz; + bool try_tiled = (opt_mm_select >= 2); + if (try_tiled && tiled_vtcm_size <= sess->vtcm_size) { + kparams->kernel_type = HTP_MM_KERNEL_HVX_QUANT_ROW; + kparams->vtcm_src0_size = src0_sz; + kparams->vtcm_src1_size = src1_sz; + kparams->vtcm_src2_size = src2_sz; + kparams->vtcm_src3_size = src3_sz; + kparams->vtcm_size = tiled_vtcm_size; + kparams->n_prefetch = best_n_prefetch; + } else { + kparams->kernel_type = HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT; + size_t flat_src1_row_size = (wtype == GGML_TYPE_Q4_1) ? htp_mm_q8_1_flat_row_size(ne10) : htp_mm_q8_0_flat_row_size(ne10); + size_t flat_src1_sz = hex_round_up(flat_src1_row_size * src1_nrows, 128); + kparams->vtcm_src0_size = src0_sz; + kparams->vtcm_src1_size = flat_src1_sz; + kparams->vtcm_src2_size = src2_sz; + kparams->vtcm_src3_size = src3_sz; + kparams->vtcm_size = src0_sz + flat_src1_sz + src2_sz + src3_sz; + kparams->n_prefetch = best_n_prefetch; + } +} + +static void ggml_hexagon_precompute_fused_ffn_params( + const struct ggml_hexagon_session * sess, + const struct ggml_tensor * src0, // Wgate + const struct ggml_tensor * src1, // y + struct htp_mm_kernel_params * kparams +) { + memset(kparams, 0, sizeof(*kparams)); + + const int wtype = src0->type; + const bool is_repack = ggml_hexagon_is_repack_type((ggml_type) wtype); + + const int ne10 = src1->ne[0]; + const int src1_nrows = src1->ne[1] * src1->ne[2] * src1->ne[3]; + const size_t src1_row_size = (wtype == GGML_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10) : htp_mm_q8_0_tiled_row_size(ne10); + const size_t src0_row_size = src0->nb[1]; + const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); + + size_t src0_sz_per_thread = 0; + size_t src2_sz_per_thread = 0; + uint32_t best_n_prefetch = 16; + + if (is_repack) { + uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype); + uint32_t n_k_tiles = hex_round_up(ne10, 32) / 32; + uint32_t tile_row_size = n_k_tiles * aligned_tile_size; + size_t src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0_TILED * sizeof(float)); + size_t src1_sz_per_thread = hex_round_up(src1_row_size * src1_nrows, 128); + size_t src1_sz = src1_sz_per_thread; + + const uint32_t max_prefetch = (src1_nrows > HTP_MM_HMX_MIN_NROWS) ? 2 : 16; + best_n_prefetch = 2; + for (uint32_t d = max_prefetch; d >= 2; d /= 2) { + size_t repacked_vtcm_size = hex_round_up(d * tile_row_size, 128); + if (repacked_vtcm_size < src1_row_size_padded) { + repacked_vtcm_size = src1_row_size_padded; + } + size_t src0_sz = repacked_vtcm_size * sess->n_threads; + size_t src2_sz = hex_round_up(d * tile_row_size, 128) * sess->n_threads; + size_t tiled_vtcm_size = src0_sz + src1_sz + src2_sz; + + if (tiled_vtcm_size <= sess->vtcm_size) { + best_n_prefetch = d; + src0_sz_per_thread = repacked_vtcm_size; + src2_sz_per_thread = hex_round_up(d * tile_row_size, 128); + break; + } + } + if (best_n_prefetch == 2 && src0_sz_per_thread == 0) { + size_t repacked_vtcm_size = hex_round_up(2 * tile_row_size, 128); + if (repacked_vtcm_size < src1_row_size_padded) { + repacked_vtcm_size = src1_row_size_padded; + } + src0_sz_per_thread = repacked_vtcm_size; + src2_sz_per_thread = hex_round_up(2 * tile_row_size, 128); + } + } else { + best_n_prefetch = 16; + src0_sz_per_thread = hex_round_up(best_n_prefetch * src0_row_size_padded, 128); + src2_sz_per_thread = hex_round_up(best_n_prefetch * src0_row_size_padded, 128); + } + + size_t src1_sz_per_thread = hex_round_up(src1_row_size * src1_nrows, 128); + + size_t src0_sz = src0_sz_per_thread * sess->n_threads; + size_t src1_sz = src1_sz_per_thread; + size_t src2_sz = src2_sz_per_thread * sess->n_threads; + + size_t tiled_vtcm_size = src0_sz + src1_sz + src2_sz; + bool try_tiled = (opt_mm_select >= 2); + if (try_tiled && tiled_vtcm_size <= sess->vtcm_size) { + kparams->kernel_type = HTP_MM_KERNEL_HVX_QUANT_ROW; + kparams->vtcm_src0_size = src0_sz; + kparams->vtcm_src1_size = src1_sz; + kparams->vtcm_src2_size = src2_sz; + kparams->vtcm_size = tiled_vtcm_size; + kparams->n_prefetch = best_n_prefetch; + } else { + kparams->kernel_type = HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT; + size_t flat_src1_row_size = (wtype == GGML_TYPE_Q4_1) ? htp_mm_q8_1_flat_row_size(ne10) : htp_mm_q8_0_flat_row_size(ne10); + size_t flat_src1_sz = hex_round_up(flat_src1_row_size * src1_nrows, 128); + kparams->vtcm_src0_size = src0_sz; + kparams->vtcm_src1_size = flat_src1_sz; + kparams->vtcm_src2_size = src2_sz; + kparams->vtcm_size = src0_sz + flat_src1_sz + src2_sz; + kparams->n_prefetch = best_n_prefetch; + } +} + static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * sess, const struct ggml_tensor * dst) { const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; @@ -2585,12 +2625,13 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s return false; } - if (ggml_nrows(src0) > 16 * 1024) { - return false; // typically the lm-head which would be too large for VTCM + // hardcoded limit to refuse the lm-head for now + if (src0->ne[1] > 32768) { + return false; } - if (ggml_nrows(src1) > 1024 || src1->ne[2] != 1 || src1->ne[3] != 1) { - return false; // no huge batches or broadcasting (for now) + if (src1->ne[2] != 1 || src1->ne[3] != 1) { + return false; // no broadcasting (for now) } // src0 (weights) must be repacked @@ -2601,16 +2642,11 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s case GGML_TYPE_F16: if (src0->nb[1] < src0->nb[0]) { - GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F16 src0 not supported\n"); return false; } if (src1->ne[2] < src0->ne[2] || src1->ne[3] < src0->ne[3]) { - GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: src1 broadcasting not supported\n"); return false; } - if (ggml_nrows(src1) > 1024) { - return false; // no huge batches (for now) - } break; case GGML_TYPE_F32: @@ -2618,22 +2654,24 @@ static bool ggml_hexagon_supported_mul_mat(const struct ggml_hexagon_session * s return false; } if (src0->nb[1] < src0->nb[0]) { - GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: permuted F32 src0 not supported\n"); return false; } if (src1->ne[2] < src0->ne[2] || src1->ne[3] < src0->ne[3]) { - GGML_LOG_DEBUG("ggml_hexagon_supported_mul_mat: src1 broadcasting not supported\n"); return false; } - if (ggml_nrows(src1) > 1024) { - return false; // no huge batches (for now) - } break; default: return false; } + struct htp_mm_kernel_params kparams; + ggml_hexagon_precompute_matmul_params(sess, src0, src1, dst, &kparams); + if ((size_t)kparams.vtcm_size > sess->vtcm_size) { + HEX_VERBOSE("ggml-hex: %s supported MUL_MAT VTCM size needed (%d) > budget (%zu)\n", sess->c_name(), kparams.vtcm_size, sess->vtcm_size); + return false; + } + return true; } @@ -2667,6 +2705,13 @@ static bool ggml_hexagon_supported_mul_mat_id(const struct ggml_hexagon_session return false; } + struct htp_mm_kernel_params kparams; + ggml_hexagon_precompute_matmul_params(sess, src0, src1, dst, &kparams); + if ((size_t)kparams.vtcm_size > sess->vtcm_size) { + HEX_VERBOSE("ggml-hex: %s supported MUL_MAT_ID VTCM size needed (%d) > budget (%zu)\n", sess->c_name(), kparams.vtcm_size, sess->vtcm_size); + return false; + } + return true; } @@ -3198,47 +3243,172 @@ static inline bool op_is_compute(ggml_tensor *node) return !ggml_op_is_empty(node->op) && !ggml_is_empty(node) && (node->flags & GGML_TENSOR_FLAG_COMPUTE); } +static bool is_hmx_eligible(const ggml_tensor * t) { + if (opt_nhmx == 0) { return false; } + + const ggml_tensor * src0 = t->src[0]; + const ggml_tensor * src1 = t->src[1]; + + const int wtype = src0->type; + const bool is_repack = ggml_hexagon_is_repack_type((ggml_type) wtype); + const bool is_matmul_id = (t->op == GGML_OP_MUL_MAT_ID); + const bool is_batched = (src0->ne[2] * src0->ne[3] > 1 || src1->ne[2] * src1->ne[3] > 1); + + const int ne01_padded = is_repack ? hex_round_up(src0->ne[1], 32) : src0->ne[1]; + + return ggml_hexagon_matmul_is_hmx_eligible(src0, src1, t, ne01_padded, is_matmul_id, is_batched); +} + +static bool is_mergeable_mul_mat(const ggml_tensor * t) { + if (!t || t->op != GGML_OP_MUL_MAT) return false; + if (t->src[1]->type != GGML_TYPE_F32) return false; + return ggml_is_quantized(t->src[0]->type) && !is_hmx_eligible(t); +} + +static bool is_mergeable_mul_mat_pair(const ggml_tensor * n1, const ggml_tensor * n2) { + if (!is_mergeable_mul_mat(n1) || !is_mergeable_mul_mat(n2)) { + return false; + } + if (n1->src[1] != n2->src[1]) { + return false; + } + if (n1->src[0]->ne[0] != n2->src[0]->ne[0] || + n1->src[0]->ne[1] != n2->src[0]->ne[1]) { + return false; + } + if (n1->src[0]->type != n2->src[0]->type) { + return false; + } + return true; +} + +static bool is_qkv_mergeable(const ggml_tensor * n_q, const ggml_tensor * n_k, const ggml_tensor * n_v) { + if (!is_mergeable_mul_mat(n_q) || !is_mergeable_mul_mat(n_k) || !is_mergeable_mul_mat(n_v)) { + return false; + } + if (n_q->src[1] != n_k->src[1] || n_q->src[1] != n_v->src[1]) { + return false; + } + if (n_q->src[0]->type != n_k->src[0]->type || n_q->src[0]->type != n_v->src[0]->type) { + return false; + } + if (n_k->src[0]->ne[0] != n_v->src[0]->ne[0] || + n_k->src[0]->ne[1] != n_v->src[0]->ne[1]) { + return false; + } + if (n_q->src[0]->ne[0] != n_k->src[0]->ne[0]) { + return false; + } + return true; +} + +static bool try_fuse_node(const ggml_hexagon_session * sess, const ggml_cgraph * graph, int & i, std::vector & nodes) { + if (!opt_opfusion) { + return false; + } + + ggml_tensor * n = graph->nodes[i]; + ggml_tensor * next_node = (i + 1 < graph->n_nodes) ? graph->nodes[i + 1] : nullptr; + + if (n->op == GGML_OP_RMS_NORM && next_node) { + if (next_node->op == GGML_OP_MUL && op_is_compute(next_node) && ggml_can_fuse(graph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { + htp_opnode node(n, {}, HTP_OP_RMS_NORM_MUL); + node.add_fused(next_node); + nodes.push_back(std::move(node)); + i++; // skip the fused MUL node + return true; + } + } + + if (is_mergeable_mul_mat(n)) { + ggml_tensor * n1 = (i + 1 < graph->n_nodes) ? graph->nodes[i + 1] : nullptr; + ggml_tensor * n2 = (i + 2 < graph->n_nodes) ? graph->nodes[i + 2] : nullptr; + if (is_qkv_mergeable(n, n1, n2)) { + struct htp_mm_kernel_params kparams; + ggml_hexagon_precompute_fused_qkv_params(sess, n1->src[0], n1->src[1], &kparams); + if ((size_t)kparams.vtcm_size <= sess->vtcm_size) { + // Reorder to KVQ: K (n1), V (n2), Q (n) + htp_opnode node(n1, {}, HTP_OP_MUL_MAT_QKV); + node.add_fused(n2, true); + node.add_fused(n, true); + memcpy(node.kernel_params, &kparams, sizeof(kparams)); + nodes.push_back(std::move(node)); + i += 2; + return true; + } else { + HEX_VERBOSE("ggml-hex: skip QKV fusion because VTCM needed (%d) > budget (%zu)\n", + kparams.vtcm_size, sess->vtcm_size); + } + } + if (is_mergeable_mul_mat_pair(n, n1)) { + struct htp_mm_kernel_params kparams; + ggml_hexagon_precompute_fused_ffn_params(sess, n->src[0], n->src[1], &kparams); + if ((size_t)kparams.vtcm_size <= sess->vtcm_size) { + htp_opnode node(n, {}, HTP_OP_MUL_MAT_FFN); + node.add_fused(n1, true); + memcpy(node.kernel_params, &kparams, sizeof(kparams)); + nodes.push_back(std::move(node)); + i += 1; + return true; + } else { + HEX_VERBOSE("ggml-hex: skip FFN fusion because VTCM needed (%d) > budget (%zu)\n", + kparams.vtcm_size, sess->vtcm_size); + } + } + } + + return false; +} + static ggml_status ggml_backend_hexagon_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { auto sess = static_cast(backend->context); HEX_VERBOSE("ggml-hex: %s graph-compute n_nodes %d\n", sess->c_name(), graph->n_nodes); - std::vector nodes; - nodes.reserve(graph->n_nodes); - - // Fusion - for (int i = 0; i < graph->n_nodes; ++i) { - ggml_tensor * n = graph->nodes[i]; - if (!op_is_compute(n)) { - continue; - } + const std::vector * nodes_ptr = nullptr; + std::vector computed_nodes; - ggml_tensor * next_node = (i + 1 < graph->n_nodes) ? graph->nodes[i + 1] : nullptr; + // Check for cache hit + bool cache_hit = (graph->uid != 0 && sess->cached_graph.uid == graph->uid); + if (cache_hit) { + nodes_ptr = &sess->cached_graph.htp_nodes; + } else { + computed_nodes.reserve(graph->n_nodes); - htp_opnode node = { - /*.node =*/ n, - /*.fused =*/ {}, - /*.opcode =*/ HTP_OP_INVALID - }; + // Fuse and finalize + for (int i = 0; i < graph->n_nodes; ++i) { + ggml_tensor * n = graph->nodes[i]; + if (!op_is_compute(n)) { + continue; + } - if (n->op == GGML_OP_RMS_NORM && next_node) { - if (next_node->op == GGML_OP_MUL && op_is_compute(next_node) && ggml_can_fuse(graph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL })) { - node.add_fused(next_node); - node.opcode = HTP_OP_RMS_NORM_MUL; - i++; // skip the fused MUL node + if (try_fuse_node(sess, graph, i, computed_nodes)) { + continue; } - } - if (node.opcode == HTP_OP_INVALID) { + htp_opnode node(n, {}, HTP_OP_INVALID); node.opcode = op_remap_to_htp(n); + if (node.opcode == HTP_OP_MUL_MAT || node.opcode == HTP_OP_MUL_MAT_ID) { + ggml_hexagon_precompute_matmul_params(sess, + node.node->src[0], node.node->src[1], node.node, + (struct htp_mm_kernel_params *)node.kernel_params + ); + } + computed_nodes.push_back(std::move(node)); } - nodes.push_back(std::move(node)); + if (graph->uid != 0) { + sess->cached_graph.uid = graph->uid; + sess->cached_graph.htp_nodes = std::move(computed_nodes); + nodes_ptr = &sess->cached_graph.htp_nodes; + } else { + nodes_ptr = &computed_nodes; + } } // Queue and execute if (opt_opstage & HTP_OPSTAGE_QUEUE) { - for (const auto & node : nodes) { + for (const auto & node : *nodes_ptr) { sess->enqueue_op(node); } } @@ -3901,15 +4071,19 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { const char * str_opbatch = getenv("GGML_HEXAGON_OPBATCH"); const char * str_opqueue = getenv("GGML_HEXAGON_OPQUEUE"); const char * str_oppoll = getenv("GGML_HEXAGON_OPPOLL"); + const char * str_opfusion = getenv("GGML_HEXAGON_OPFUSION"); const char * str_opfilter = getenv("GGML_HEXAGON_OPFILTER"); const char * str_profile = getenv("GGML_HEXAGON_PROFILE"); const char * str_etm = getenv("GGML_HEXAGON_ETM"); const char * str_nhvx = getenv("GGML_HEXAGON_NHVX"); const char * str_use_hmx = getenv("GGML_HEXAGON_USE_HMX"); + const char * str_nhmx = getenv("GGML_HEXAGON_NHMX"); + const char * str_mm_select = getenv("GGML_HEXAGON_MM_SELECT"); const char * str_ndev = getenv("GGML_HEXAGON_NDEV"); const char * str_arch = getenv("GGML_HEXAGON_ARCH"); const char * str_vmem = getenv("GGML_HEXAGON_VMEM"); const char * str_mbuf = getenv("GGML_HEXAGON_MBUF"); + const char * str_optrace = getenv("GGML_HEXAGON_OPTRACE"); // Init Arch first since it affects other defaults if (!str_arch) { @@ -3938,11 +4112,14 @@ static void ggml_hexagon_init(ggml_backend_reg * reg) { opt_opstage = str_opstage ? strtoul(str_opstage, NULL, 0) : opt_opstage; opt_opbatch = str_opbatch ? strtoul(str_opbatch, NULL, 0) : opt_opbatch; opt_opqueue = str_opqueue ? strtoul(str_opqueue, NULL, 0) : opt_opqueue; + opt_optrace = str_optrace ? strtoul(str_optrace, NULL, 0) : (opt_opbatch * 128); opt_oppoll = str_oppoll ? strtoul(str_oppoll, NULL, 0) : opt_oppoll; + opt_opfusion = str_opfusion ? atoi(str_opfusion) : opt_opfusion; opt_profile = str_profile ? atoi(str_profile) : 0; opt_etm = str_etm ? atoi(str_etm) : 0; opt_nhvx = str_nhvx ? strtoul(str_nhvx, NULL, 0) : opt_nhvx; - opt_use_hmx = str_use_hmx ? atoi(str_use_hmx) : opt_use_hmx; + opt_nhmx = str_nhmx ? atoi(str_nhmx) : (str_use_hmx ? atoi(str_use_hmx) : opt_nhmx); + opt_mm_select = str_mm_select ? atoi(str_mm_select) : opt_mm_select; opt_ndev = str_ndev ? strtoul(str_ndev, NULL, 0) : opt_ndev; opt_hostbuf = str_hostbuf ? atoi(str_hostbuf) : opt_hostbuf; opt_mbuf = str_mbuf ? strtoul(str_mbuf, NULL, 0) * MiB : opt_mbuf; diff --git a/ggml/src/ggml-hexagon/htp-opnode.h b/ggml/src/ggml-hexagon/htp-opnode.h index 52c727c6206b..6fe23b0d6aac 100644 --- a/ggml/src/ggml-hexagon/htp-opnode.h +++ b/ggml/src/ggml-hexagon/htp-opnode.h @@ -5,10 +5,12 @@ #include "ggml-backend-impl.h" #include "ggml-common.h" +#include #include #include #include #include "htp-ops.h" +#include "htp/matmul-ops.h" struct htp_opnode { ggml_tensor * node = nullptr; @@ -17,6 +19,13 @@ struct htp_opnode { htp_op_code opcode = HTP_OP_INVALID; + std::vector extra_dsts; + + int32_t kernel_params[HTP_OP_MAX_KERN_PARAMS] = {0}; + + htp_opnode(ggml_tensor * node = nullptr, std::vector fused = {}, htp_op_code opcode = HTP_OP_INVALID, std::vector extra_dsts = {}) + : node(node), fused(std::move(fused)), opcode(opcode), extra_dsts(std::move(extra_dsts)) {} + ggml_op op() const { return node->op; } @@ -25,6 +34,26 @@ struct htp_opnode { return fused.empty() ? node : fused.back(); } + void add_fused(ggml_tensor * t, bool extra_dst = false) { + fused.push_back(t); + if (extra_dst) { + extra_dsts.push_back(t); + } + } + + std::vector get_outputs() const { + std::vector res; + if (extra_dsts.empty()) { + res.push_back(dst()); + } else { + res.push_back(node); + for (const auto * x : extra_dsts) { + res.push_back(x); + } + } + return res; + } + const ggml_tensor * src0() const { return node->src[0]; } @@ -37,10 +66,6 @@ struct htp_opnode { return ggml_op_is_empty(node->op); } - void add_fused(ggml_tensor * t) { - fused.push_back(t); - } - bool stackable() const { switch (this->op()) { case GGML_OP_MUL_MAT: @@ -131,87 +156,117 @@ struct htp_opformat { char types[16 * GGML_MAX_SRC]; char buffs[64 * GGML_MAX_SRC]; char names[64 * GGML_MAX_SRC]; + char kparams[128]; - int format_tensor_dims(char * str, const struct ggml_tensor * t) { + int format_tensor_dims(char * str, size_t max_size, const struct ggml_tensor * t) { if (!t) { - return sprintf(str, "NONE"); + return snprintf(str, max_size, "NONE"); } if (t->ne[2] == 1 && t->ne[3] == 1) { - return sprintf(str, "%d:%d", (int) t->ne[0], (int) t->ne[1]); + return snprintf(str, max_size, "%d:%d", (int) t->ne[0], (int) t->ne[1]); } else { - return sprintf(str, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); + return snprintf(str, max_size, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]); } } - void format_op_dims(char * str, const htp_opnode & node) { + void format_op_dims(char * str, size_t max_size, const htp_opnode & node) { char * p = str; + char * p_end = str + max_size; auto inputs = node.get_inputs(); if (!inputs.empty()) { - p += format_tensor_dims(p, inputs[0]); + p += std::min((size_t)format_tensor_dims(p, p_end - p, inputs[0]), (size_t)(p_end - p)); for (size_t i = 1; i < inputs.size(); i++) { - p += sprintf(p, " x "); - p += format_tensor_dims(p, inputs[i]); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p)); + } + if (p < p_end) { + p += std::min((size_t)format_tensor_dims(p, p_end - p, inputs[i]), (size_t)(p_end - p)); + } } - p += sprintf(p, " -> "); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p)); + } } char self[64]; - format_tensor_dims(self, node.dst()); - p += sprintf(p, "%s", self); + format_tensor_dims(self, sizeof(self), node.dst()); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, "%s", self), (size_t)(p_end - p)); + } } - int format_tensor_strides(char * str, const struct ggml_tensor * t) { + int format_tensor_strides(char * str, size_t max_size, const struct ggml_tensor * t) { if (!t) { - return sprintf(str, "NONE"); + return snprintf(str, max_size, "NONE"); } const char * c = ggml_is_contiguous(t) ? "" : "!"; if (t->ne[2] == 1 && t->ne[3] == 1) { - return sprintf(str, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c); + return snprintf(str, max_size, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c); } else { - return sprintf(str, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c); + return snprintf(str, max_size, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c); } } - void format_op_strides(char * str, const htp_opnode & node) { + void format_op_strides(char * str, size_t max_size, const htp_opnode & node) { char * p = str; + char * p_end = str + max_size; auto inputs = node.get_inputs(); if (!inputs.empty()) { - p += format_tensor_strides(p, inputs[0]); + p += std::min((size_t)format_tensor_strides(p, p_end - p, inputs[0]), (size_t)(p_end - p)); for (size_t i = 1; i < inputs.size(); i++) { - p += sprintf(p, " x "); - p += format_tensor_strides(p, inputs[i]); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p)); + } + if (p < p_end) { + p += std::min((size_t)format_tensor_strides(p, p_end - p, inputs[i]), (size_t)(p_end - p)); + } } - p += sprintf(p, " -> "); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p)); + } } char self[64]; - format_tensor_strides(self, node.dst()); - p += sprintf(p, "%s", self); + format_tensor_strides(self, sizeof(self), node.dst()); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, "%s", self), (size_t)(p_end - p)); + } } - void format_op_types(char * str, const htp_opnode & node) { + void format_op_types(char * str, size_t max_size, const htp_opnode & node) { char * p = str; + char * p_end = str + max_size; auto inputs = node.get_inputs(); if (!inputs.empty()) { - p += sprintf(p, "%s", inputs[0] ? ggml_type_name(inputs[0]->type) : "NONE"); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[0] ? ggml_type_name(inputs[0]->type) : "NONE"), (size_t)(p_end - p)); + } for (size_t i = 1; i < inputs.size(); i++) { - p += sprintf(p, " x "); - p += sprintf(p, "%s", inputs[i] ? ggml_type_name(inputs[i]->type) : "NONE"); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p)); + } + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[i] ? ggml_type_name(inputs[i]->type) : "NONE"), (size_t)(p_end - p)); + } } - p += sprintf(p, " -> "); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p)); + } } - p += sprintf(p, "%s", ggml_type_name(node.dst()->type)); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, "%s", ggml_type_name(node.dst()->type)), (size_t)(p_end - p)); + } } const char * tensor_buff_name(const struct ggml_tensor * t) { @@ -221,51 +276,102 @@ struct htp_opformat { return "NONE"; } - void format_op_buffs(char * str, const htp_opnode & node) { + void format_op_buffs(char * str, size_t max_size, const htp_opnode & node) { char * p = str; + char * p_end = str + max_size; auto inputs = node.get_inputs(); if (!inputs.empty()) { - p += sprintf(p, "%s", tensor_buff_name(inputs[0])); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(inputs[0])), (size_t)(p_end - p)); + } for (size_t i = 1; i < inputs.size(); i++) { - p += sprintf(p, " x "); - p += sprintf(p, "%s", tensor_buff_name(inputs[i])); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p)); + } + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(inputs[i])), (size_t)(p_end - p)); + } } - p += sprintf(p, " -> "); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p)); + } } - p += sprintf(p, "%s", tensor_buff_name(node.dst())); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, "%s", tensor_buff_name(node.dst())), (size_t)(p_end - p)); + } } - void format_op_names(char * str, const htp_opnode & node) { + void format_op_names(char * str, size_t max_size, const htp_opnode & node) { char * p = str; + char * p_end = str + max_size; auto inputs = node.get_inputs(); if (!inputs.empty()) { - p += sprintf(p, "%s", inputs[0] ? inputs[0]->name : "NONE"); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[0] ? inputs[0]->name : "NONE"), (size_t)(p_end - p)); + } for (size_t i = 1; i < inputs.size(); i++) { - p += sprintf(p, " x "); - p += sprintf(p, "%s", inputs[i] ? inputs[i]->name : "NONE"); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, " x "), (size_t)(p_end - p)); + } + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, "%s", inputs[i] ? inputs[i]->name : "NONE"), (size_t)(p_end - p)); + } } - p += sprintf(p, " -> "); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, " -> "), (size_t)(p_end - p)); + } } - p += sprintf(p, "%s", node.dst()->name); + if (p < p_end) { + p += std::min((size_t)snprintf(p, p_end - p, "%s", node.dst()->name), (size_t)(p_end - p)); + } + } + void format_kernel_params(char * str, size_t max_size, const htp_opnode & node) { + if (node.opcode == HTP_OP_MUL_MAT || node.opcode == HTP_OP_MUL_MAT_ID || + node.opcode == HTP_OP_MUL_MAT_QKV || node.opcode == HTP_OP_MUL_MAT_FFN) { + const auto * kparams = (const struct htp_mm_kernel_params *) node.kernel_params; + const char * path = "unknown"; + int32_t type = kparams->kernel_type; + if (type == HTP_MM_KERNEL_HMX_2D || type == HTP_MM_KERNEL_HMX_F16_BATCHED) { + path = "hmx-tiled"; + } else if (type == HTP_MM_KERNEL_HVX_F16_F16_VTCM || type == HTP_MM_KERNEL_HVX_F32_F32_VTCM || + type == HTP_MM_KERNEL_HVX_QUANT_ROW || type == HTP_MM_KERNEL_HVX_QUANT_BLOCK) { + path = "hvx-tiled"; + } else if (type == HTP_MM_KERNEL_HVX_F16_F16_DDR || type == HTP_MM_KERNEL_HVX_F16_F32_DDR || + type == HTP_MM_KERNEL_HVX_F32_F32_DDR || type == HTP_MM_KERNEL_HVX_F32_F16_DDR || + type == HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT) { + path = "hvx-flat"; + } + snprintf(str, max_size, "%s vtcm %d", path, (int) kparams->vtcm_size); + } else { + snprintf(str, max_size, "----"); + } } void format(const htp_opnode & node) { - format_op_dims(dims, node); - format_op_strides(strides, node); - format_op_types(types, node); - format_op_buffs(buffs, node); - format_op_names(names, node); + format_op_dims(dims, sizeof(dims), node); + format_op_strides(strides, sizeof(strides), node); + format_op_types(types, sizeof(types), node); + format_op_buffs(buffs, sizeof(buffs), node); + format_op_names(names, sizeof(names), node); + format_kernel_params(kparams, sizeof(kparams), node); } - htp_opformat() {} + htp_opformat() { + strides[0] = '\0'; + dims[0] = '\0'; + types[0] = '\0'; + buffs[0] = '\0'; + names[0] = '\0'; + kparams[0] = '\0'; + } htp_opformat(const htp_opnode & node) { format(node); } }; diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index f4b44fe1a654..c48a5b86e3bb 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -19,43 +19,9 @@ add_library(${HTP_LIB} SHARED htp_iface_skel.c worker-pool.c hex-dma.c -) - -target_compile_definitions(${HTP_LIB} PRIVATE - $,HTP_DEBUG=1,NDEBUG=1> - $,FARF_HIGH=1,> - FP32_QUANTIZE_GROUP_SIZE=${GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE}) - -if (GGML_HEXAGON_FA_EXP2_HF) - message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)") - target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1) -endif() - -# HMX acceleration: available on v73+ architectures -set(HTP_HMX_VERSIONS v73 v75 v79 v81) -list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx) - -if (_hmx_idx GREATER_EQUAL 0) - target_sources(${HTP_LIB} PRIVATE - hmx-matmul-ops.c - hmx-flash-attn-ops.c - hmx-queue.c - ) - - # -mhmx enables HMX instruction set (needed by files that include hmx-utils.h) - set_source_files_properties( - hmx-flash-attn-ops.c - hmx-matmul-ops.c - hmx-queue.c - PROPERTIES COMPILE_OPTIONS "-mhmx" - ) - - target_compile_definitions(${HTP_LIB} PRIVATE HTP_HAS_HMX=1) -endif() - -build_idl(htp_iface.idl ${HTP_LIB}) - -target_sources(${HTP_LIB} PRIVATE + hmx-queue.c + flash-attn-ops.c + hmx-flash-attn-ops.c matmul-ops.c binary-ops.c unary-ops.c @@ -63,7 +29,6 @@ target_sources(${HTP_LIB} PRIVATE softmax-ops.c act-ops.c rope-ops.c - flash-attn-ops.c set-rows-ops.c get-rows-ops.c cpy-ops.c @@ -79,6 +44,17 @@ target_sources(${HTP_LIB} PRIVATE pad-ops.c ) +target_compile_definitions(${HTP_LIB} PRIVATE + $,HTP_DEBUG=1,NDEBUG=1> + $,FARF_HIGH=1,>) + +if (GGML_HEXAGON_FA_EXP2_HF) + message(STATUS "ggml-htp: HMX_FA_USE_EXP2_HF=1 (use FP16 exp2 polynomial in FA softmax)") + target_compile_definitions(${HTP_LIB} PRIVATE HMX_FA_USE_EXP2_HF=1) +endif() + +build_idl(htp_iface.idl ${HTP_LIB}) + set_target_properties(${HTP_LIB} PROPERTIES EXPORT_COMPILE_COMMANDS ON) install(TARGETS ${HTP_LIB}) diff --git a/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake b/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake index ed5c198468ca..3eff2a3986e7 100644 --- a/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake +++ b/ggml/src/ggml-hexagon/htp/cmake-toolchain.cmake @@ -3,7 +3,7 @@ if (HEXAGON_TOOLCHAIN_INCLUDED) endif() set(HEXAGON_TOOLCHAIN_INCLUDED true) -#Cross Compiling for Hexagon +# Cross Compiling for Hexagon set(HEXAGON TRUE) set(CMAKE_SYSTEM_NAME QURT) set(CMAKE_SYSTEM_PROCESSOR Hexagon) @@ -14,7 +14,6 @@ set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) set(CUSTOM_RUNELF_PATH "") -#To fix backward compatibility with EAI addon. if (NOT HEXAGON_SDK_ROOT) set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT}) endif() @@ -31,7 +30,6 @@ endif() file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT) file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT) -#Get the Binary extension of the Hexagon Toolchain if(CMAKE_HOST_SYSTEM_NAME STREQUAL Windows) set(HEXAGON_TOOLCHAIN_SUFFIX .exe) endif() @@ -48,12 +46,12 @@ set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES HEXAGON_TOOLS_ROOT ) -#QURT Related includes and linker flags +# QURT Related includes and linker flags set(V_ARCH ${HEXAGON_ARCH}) set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/ADSP${V_ARCH}MP${V_ARCH_EXTN}") set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/compute${V_ARCH}${V_ARCH_EXTN}") -if( ${TREE} MATCHES PAKMAN ) +if (${TREE} MATCHES PAKMAN) set(_QURT_INSTALL_DIR "${QURT_IMAGE_DIR}/compute${V_ARCH}${V_ARCH_EXTN}") endif() message(DEBUG "_QURT_INSTALL_DIR:${_QURT_INSTALL_DIR}") @@ -83,11 +81,9 @@ set(QURT_START_LINK_LIBS ) STRING(REPLACE ";" " " QURT_START_LINK_LIBS "${QURT_START_LINK_LIBS}") -set(QURT_END_LINK_LIBS - ${TARGET_DIR}/fini.o - ) +set(QURT_END_LINK_LIBS ${TARGET_DIR}/fini.o) -#Non QURT related includes and linker flags +# Non QURT related includes and linker flags set(TARGET_DIR_NOOS "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/${HEXAGON_ARCH}") @@ -99,8 +95,10 @@ if (NOT NO_WRAP_MEM_API) set(WRAP_MEMALIGN -Wl,--wrap=memalign) endif() +set(ARCH_FLAGS "-mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -mhmx") + set(PIC_SHARED_LD_FLAGS - -mcpu=${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} + ${ARCH_FLAGS} -G0 -fpic -Wl,-Bsymbolic @@ -120,13 +118,13 @@ STRING(REPLACE ";" " " PIC_SHARED_LD_FLAGS "${PIC_SHARED_LD_FLAGS}") set(HEXAGON_PIC_SHARED_LINK_OPTIONS "${PIC_SHARED_LD_FLAGS}") -#System include paths +# System include paths include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs) include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef) include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/ipc/fastrpc/incs) -#LLVM toolchain setup -#Compiler paths, options and architecture +# LLVM toolchain setup +# Compiler paths, options and architecture set(CMAKE_C_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang${HEXAGON_TOOLCHAIN_SUFFIX}) set(CMAKE_CXX_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX}) set(CMAKE_AR ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-ar${HEXAGON_TOOLCHAIN_SUFFIX}) @@ -137,8 +135,8 @@ set(CMAKE_PREFIX_PATH ${HEXAGON_TOOLCHAIN}/Tools/target/hexagon) set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-Wl,-soname,") set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Wl,-soname,") -#Compiler Options -set(COMMON_FLAGS "-mcpu=hexagon${V_ARCH} -m${V_ARCH} -mhvx=${V_ARCH} -fvectorize -flto -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}") +# Compiler Options +set(COMMON_FLAGS "${ARCH_FLAGS} -fvectorize -flto -Wall -Werror -fno-zero-initialized-in-bss -G0 -fdata-sections -fpic ${XQF_ARGS}") set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} -O0 -D_DEBUG -g") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${COMMON_FLAGS} -O2 -g") diff --git a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c index e996214691a5..65f7844ae33e 100644 --- a/ggml/src/ggml-hexagon/htp/flash-attn-ops.c +++ b/ggml/src/ggml-hexagon/htp/flash-attn-ops.c @@ -18,7 +18,8 @@ #include "htp-ctx.h" #include "htp-ops.h" #include "htp-ops.h" -#include "hmx-ops.h" + +int hmx_flash_attn_ext(struct htp_ops_context * octx); // Must be multiple of 32 #define FLASH_ATTN_BLOCK_SIZE (32 * 2) @@ -339,6 +340,9 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * if (ir0 >= ir1) return; + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); + dma_queue * dma = octx->ctx->dma[ith]; const uint32_t DK = nek0; @@ -615,6 +619,7 @@ static void flash_attn_ext_f16_thread(unsigned int nth, unsigned int ith, void * hvx_copy_f16_f32_ua(dst_ptr, (uint8_t *) VKQ32, DV); } } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); } int op_flash_attn_ext(struct htp_ops_context * octx) { @@ -629,7 +634,6 @@ int op_flash_attn_ext(struct htp_ops_context * octx) { return HTP_STATUS_NO_SUPPORT; } -#ifdef HTP_HAS_HMX // HMX path: head_dim multiple of 64, F16 KV, and no sinks if (k->type == HTP_TYPE_F16 && v->type == HTP_TYPE_F16 && k->ne[0] % 64 == 0 && v->ne[0] % 64 == 0 && octx->src[4] == NULL) { int ret = hmx_flash_attn_ext(octx); @@ -638,7 +642,6 @@ int op_flash_attn_ext(struct htp_ops_context * octx) { } // VTCM too small or other failure -> fall through to HVX path } -#endif struct htp_fa_context factx; factx.octx = octx; diff --git a/ggml/src/ggml-hexagon/htp/hex-common.h b/ggml/src/ggml-hexagon/htp/hex-common.h new file mode 100644 index 000000000000..4714486a042f --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hex-common.h @@ -0,0 +1,80 @@ +#ifndef HEX_COMMON_H +#define HEX_COMMON_H + +#include +#include +#include + +#ifndef SIZE_MAX +#define SIZE_MAX ((size_t)-1) +#endif + +#ifndef MAX +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif + +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif + +static inline uint32_t hex_ceil_pow2(uint32_t x) { + if (x <= 1) { return 1; } + int p = 2; + x--; + while (x >>= 1) { p <<= 1; } + return p; +} + +static inline size_t hmx_ceil_div(size_t num, size_t den) { + return (num + den - 1) / den; +} + +static inline int32_t hex_is_aligned(const void * addr, uint32_t align) { + return ((size_t) addr & (align - 1)) == 0; +} + +static inline size_t hex_align_up(size_t v, size_t align) { + return hmx_ceil_div(v, align) * align; +} + +static inline size_t hex_align_down(size_t v, size_t align) { + return (v / align) * align; +} + +static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { + uint32_t left_off = (size_t) addr & (chunk_size - 1); + uint32_t right_off = left_off + n; + return right_off <= chunk_size; +} + +static inline uint32_t hex_round_up(uint32_t n, uint32_t m) { + return m * ((n + m - 1) / m); +} + +static inline size_t hex_smin(size_t a, size_t b) { + return a < b ? a : b; +} + +static inline size_t hex_smax(size_t a, size_t b) { + return a > b ? a : b; +} + +static inline void hex_swap_ptr(void ** p1, void ** p2) { + void * t = *p1; + *p1 = *p2; + *p2 = t; +} + +static inline bool hex_mul_overflow(size_t a, size_t b, size_t *out) { + if (a != 0 && b > SIZE_MAX / a) return true; + *out = a * b; + return false; +} + +static inline bool hex_add_overflow(size_t a, size_t b, size_t *out) { + if (a > SIZE_MAX - b) return true; + *out = a + b; + return false; +} + +#endif // HEX_COMMON_H diff --git a/ggml/src/ggml-hexagon/htp/hex-dma.h b/ggml/src/ggml-hexagon/htp/hex-dma.h index 7685473f4631..8031a5679c4a 100644 --- a/ggml/src/ggml-hexagon/htp/hex-dma.h +++ b/ggml/src/ggml-hexagon/htp/hex-dma.h @@ -5,6 +5,9 @@ #include #include #include +#include "hex-utils.h" + +#include "hex-profile.h" #ifdef __cplusplus extern "C" { @@ -88,6 +91,7 @@ typedef struct { uint32_t pop_idx; uint32_t capacity; uint32_t idx_mask; + struct htp_thread_trace * trace; } dma_queue; dma_queue * dma_queue_create(size_t capacity); @@ -124,13 +128,8 @@ static inline dma_ptr dma_make_ptr(void *dst, const void *src) return p; } -#if __HVX_ARCH__ < 73 -static const uint32_t dma_src_l2_bypass_on = 1; -static const uint32_t dma_dst_l2_bypass_on = 0; -#else static const uint32_t dma_src_l2_bypass_on = 1; static const uint32_t dma_dst_l2_bypass_on = 1; -#endif static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t size) { if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) { @@ -152,6 +151,7 @@ static inline bool dma_queue_push_single_1d(dma_queue * q, dma_ptr dptr, size_t q->dptr[q->push_idx] = dptr; if (size) { + htp_trace_event_start(q->trace, HTP_TRACE_EVT_DMA, q->push_idx); dmlink(q->tail, desc); q->tail = (dma_descriptor_2d *) desc; } else { @@ -202,6 +202,7 @@ static inline bool dma_queue_push_single_2d(dma_queue * q, dma_ptr dptr, size_t q->dptr[q->push_idx] = dptr; if (nrows) { + htp_trace_event_start(q->trace, HTP_TRACE_EVT_DMA, q->push_idx); dmlink(q->tail, desc); q->tail = desc; } else { @@ -223,10 +224,12 @@ static inline dma_ptr dma_queue_pop(dma_queue * q) { dma_descriptor_2d * desc = &q->desc[q->pop_idx]; // Wait for desc to complete - while (!desc->done) { - // FARF(ERROR, "dma-pop: waiting for DMA : %u\n", q->pop_idx); - dmpoll(); + if (!desc->done) { + while (!desc->done) { + dmpoll(); + } } + htp_trace_event_stop(q->trace, HTP_TRACE_EVT_DMA, q->pop_idx); dptr = q->dptr[q->pop_idx]; diff --git a/ggml/src/ggml-hexagon/htp/hex-profile.h b/ggml/src/ggml-hexagon/htp/hex-profile.h new file mode 100644 index 000000000000..8a37a4a06664 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hex-profile.h @@ -0,0 +1,64 @@ +#ifndef HEX_PROFILE_H +#define HEX_PROFILE_H + +#include +#include +#include + +#include "hex-utils.h" +#include "htp-ops.h" + +#define HTP_TRACE_EVT_START 0 +#define HTP_TRACE_EVT_STOP 1 + +#ifndef HEX_NUM_PMU_COUNTERS +#define HEX_NUM_PMU_COUNTERS 8 +#endif + +static inline void hex_get_pmu(uint32_t counters[]) { +#if __HVX_ARCH__ >= 79 + asm volatile("%0 = upmucnt0" : "=r"(counters[0])); + asm volatile("%0 = upmucnt1" : "=r"(counters[1])); + asm volatile("%0 = upmucnt2" : "=r"(counters[2])); + asm volatile("%0 = upmucnt3" : "=r"(counters[3])); + asm volatile("%0 = upmucnt4" : "=r"(counters[4])); + asm volatile("%0 = upmucnt5" : "=r"(counters[5])); + asm volatile("%0 = upmucnt6" : "=r"(counters[6])); + asm volatile("%0 = upmucnt7" : "=r"(counters[7])); +#else + counters[0] = qurt_pmu_get(QURT_PMUCNT0); + counters[1] = qurt_pmu_get(QURT_PMUCNT1); + counters[2] = qurt_pmu_get(QURT_PMUCNT2); + counters[3] = qurt_pmu_get(QURT_PMUCNT3); + counters[4] = qurt_pmu_get(QURT_PMUCNT4); + counters[5] = qurt_pmu_get(QURT_PMUCNT5); + counters[6] = qurt_pmu_get(QURT_PMUCNT6); + counters[7] = qurt_pmu_get(QURT_PMUCNT7); +#endif +} + +struct htp_thread_trace { + uint32_t count; + uint32_t max_events; + struct htp_trace_desc * events; +}; + +static inline void htp_trace_event(struct htp_thread_trace * tr, uint16_t id, uint16_t info, uint32_t type) { + if (tr && tr->events && tr->count < tr->max_events) { + uint32_t idx = tr->count; + tr->events[idx].id = id; + tr->events[idx].info = info | (type == HTP_TRACE_EVT_STOP ? 0x8000 : 0); + tr->events[idx].cycles = (uint32_t) hex_get_cycles(); + tr->count++; + } +} + +static inline void htp_trace_event_start(struct htp_thread_trace * tr, uint16_t id, uint16_t info) { + htp_trace_event(tr, id, info, HTP_TRACE_EVT_START); +} + +static inline void htp_trace_event_stop(struct htp_thread_trace * tr, uint16_t id, uint16_t info) { + htp_trace_event(tr, id, info, HTP_TRACE_EVT_STOP); +} + +#endif /* HEX_PROFILE_H */ diff --git a/ggml/src/ggml-hexagon/htp/hex-utils.h b/ggml/src/ggml-hexagon/htp/hex-utils.h index 6239ceff4b41..07930bef6ec1 100644 --- a/ggml/src/ggml-hexagon/htp/hex-utils.h +++ b/ggml/src/ggml-hexagon/htp/hex-utils.h @@ -11,14 +11,7 @@ #include "hex-fastdiv.h" #include "hex-dump.h" - -#ifndef MAX -#define MAX(a, b) ((a) > (b) ? (a) : (b)) -#endif - -#ifndef MIN -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#endif +#include "hex-common.h" static inline uint64_t hex_get_cycles() { uint64_t cycles = 0; @@ -32,54 +25,6 @@ static inline uint64_t hex_get_pktcnt() { return pktcnt; } -static inline uint32_t hex_ceil_pow2(uint32_t x) { - if (x <= 1) { return 1; } - int p = 2; - x--; - while (x >>= 1) { p <<= 1; } - return p; -} - -static inline size_t hmx_ceil_div(size_t num, size_t den) { - return (num + den - 1) / den; -} - -static inline int32_t hex_is_aligned(const void * addr, uint32_t align) { - return ((size_t) addr & (align - 1)) == 0; -} - -static inline size_t hex_align_up(size_t v, size_t align) { - return hmx_ceil_div(v, align) * align; -} - -static inline size_t hex_align_down(size_t v, size_t align) { - return (v / align) * align; -} - -static inline int32_t hex_is_one_chunk(void * addr, uint32_t n, uint32_t chunk_size) { - uint32_t left_off = (size_t) addr & (chunk_size - 1); - uint32_t right_off = left_off + n; - return right_off <= chunk_size; -} - -static inline uint32_t hex_round_up(uint32_t n, uint32_t m) { - return m * ((n + m - 1) / m); -} - -static inline size_t hex_smin(size_t a, size_t b) { - return a < b ? a : b; -} - -static inline size_t hex_smax(size_t a, size_t b) { - return a > b ? a : b; -} - -static inline void hex_swap_ptr(void ** p1, void ** p2) { - void * t = *p1; - *p1 = *p2; - *p2 = t; -} - static inline void hex_l2fetch(const void * p, uint32_t width, uint32_t stride, uint32_t height) { const uint64_t control = Q6_P_combine_RR(stride, Q6_R_combine_RlRl(width, height)); Q6_l2fetch_AP((void *) p, control); @@ -107,31 +52,4 @@ static inline void hex_pause() { asm volatile(" pause(#255)\n"); } -#ifndef HEX_NUM_PMU_COUNTERS -#define HEX_NUM_PMU_COUNTERS 8 -#endif - -static inline void hex_get_pmu(uint32_t counters[]) { -#if __HVX_ARCH__ >= 79 - asm volatile("%0 = upmucnt0" : "=r"(counters[0])); - asm volatile("%0 = upmucnt1" : "=r"(counters[1])); - asm volatile("%0 = upmucnt2" : "=r"(counters[2])); - asm volatile("%0 = upmucnt3" : "=r"(counters[3])); - asm volatile("%0 = upmucnt4" : "=r"(counters[4])); - asm volatile("%0 = upmucnt5" : "=r"(counters[5])); - asm volatile("%0 = upmucnt6" : "=r"(counters[6])); - asm volatile("%0 = upmucnt7" : "=r"(counters[7])); -#else - counters[0] = qurt_pmu_get(QURT_PMUCNT0); - counters[1] = qurt_pmu_get(QURT_PMUCNT1); - counters[2] = qurt_pmu_get(QURT_PMUCNT2); - counters[3] = qurt_pmu_get(QURT_PMUCNT3); - counters[4] = qurt_pmu_get(QURT_PMUCNT4); - counters[5] = qurt_pmu_get(QURT_PMUCNT5); - counters[6] = qurt_pmu_get(QURT_PMUCNT6); - counters[7] = qurt_pmu_get(QURT_PMUCNT7); - // qurt_pmu_get_pmucnt(counters); -#endif -} - #endif /* HEX_UTILS_H */ diff --git a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c index 2796564fb754..996fd5975702 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c +++ b/ggml/src/ggml-hexagon/htp/hmx-flash-attn-ops.c @@ -18,7 +18,7 @@ #include "ggml-common.h" #include "hex-dma.h" #include "hex-fastdiv.h" -#include "hmx-profile.h" +#include "hex-profile.h" #include "hmx-queue.h" #include "hmx-utils.h" #include "htp-ctx.h" @@ -49,7 +49,7 @@ // g_br = hex_align_up(gqa_factor * Br, 32) replaces Br for all Q/O/S/P/D dimensions. // Layout: Q + O_ping + O_pong + K_dma*2 + V_dma*2 + K_tile + V_tile + S + P + D + vectors + scales // Mask is DMA'd into a VTCM buffer (Br rows per KV block) to avoid DDR reads in softmax. -static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, size_t Br, size_t Bc, size_t n_threads, bool use_pipeline) { +static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, size_t Br, size_t Bc, size_t n_threads, bool pipeline) { const size_t g_br = hex_align_up(gqa_factor * Br, HMX_FP16_TILE_N_ROWS); const size_t q_tile_size = hex_align_up(g_br * DK * sizeof(__fp16), 4096); // Q: [g_br, DK] const size_t o_tile_size = hex_align_up(g_br * DV * sizeof(__fp16), 4096); // O: [g_br, DV] x2 ping-pong @@ -70,7 +70,7 @@ static size_t hmx_fa_compute_vtcm_usage(size_t gqa_factor, size_t DK, size_t DV, + k_dma_size * 2 // K DMA x2 + v_dma_size * 2 // V DMA x2 + k_tile_size * 1 // K tiles - + v_tile_size * (use_pipeline ? 2 : 1) // V tiles (double-buffered if pipelining) + + v_tile_size * (pipeline ? 2 : 1) // V tiles (double-buffered if pipelining) + s_tile_size * 2 // S + P + d_tile_size * 1 // D (diagonal matrix) + col_vec_size * 4 // m_vec, l_vec, s_rowmax, p_rowsum @@ -290,7 +290,7 @@ static const int16_t d_tile_scatter_offsets[64] __attribute__((aligned(128))) = struct hmx_fa_context { const struct htp_ops_context * octx; - bool use_pipeline; // true when n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads >= 2 + bool pipeline; // true when n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads >= 2 uint32_t n_threads; // Op parameters @@ -367,8 +367,11 @@ static void fa_k_interleave_thread(unsigned int n, unsigned int i, void * data) return; } + struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start); hmx_interleave_rows_to_tiles(factx->vtcm_k_tiles, factx->vtcm_k_fp16[args->buf_idx], total_rows, (int) factx->DK, (int) args->src_stride, start, end); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start); } static void fa_phase_k_interleave(struct hmx_fa_context * factx, int kv_rows, size_t src_stride, size_t buf_idx) { @@ -406,10 +409,13 @@ static void fa_v_interleave_thread(unsigned int n, unsigned int i, void * data) return; } - __fp16 * v_tiles_dest = factx->use_pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0]; + __fp16 * v_tiles_dest = factx->pipeline ? factx->vtcm_v_tiles[args->buf_idx] : factx->vtcm_v_tiles[0]; + struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start); hmx_interleave_cols_to_tiles(v_tiles_dest, factx->vtcm_v_fp16[args->buf_idx], total_rows, (int) factx->DV, (int) args->src_stride, (int) args->n_col_tiles, start, end); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start); } static void fa_phase_v_interleave(struct hmx_fa_context * factx, @@ -462,6 +468,9 @@ static void fa_q_load_thread(unsigned int n, unsigned int i, void * data) { return; } + struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start); + const struct htp_tensor * q = args->q; const uint32_t q_start = args->q_start; const uint32_t kv_head = args->kv_head; @@ -515,6 +524,7 @@ static void fa_q_load_thread(unsigned int n, unsigned int i, void * data) { } } } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start); } static void fa_phase_q_load(struct hmx_fa_context * factx, @@ -566,6 +576,9 @@ static void fa_o_store_thread(unsigned int n, unsigned int i, void * data) { return; } + struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, start); + const struct htp_tensor * dst = args->dst; const __fp16 * o_tile_src = args->o_tile_src; const uint32_t q_start = args->q_start; @@ -611,6 +624,7 @@ static void fa_o_store_thread(unsigned int n, unsigned int i, void * data) { } } } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, start); } static void fa_phase_o_store(struct hmx_fa_context * factx, @@ -680,6 +694,9 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) { return; } + struct htp_thread_trace * tr = factx->octx->ctx ? &factx->octx->ctx->trace[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, vec_start); + // Per-thread row scratch: thread i uses bufs at offset i * 2 * stride const size_t row_buf_stride = factx->row_buf_stride; HVX_Vector * my_row_buf0 = factx->vtcm_row_bufs + i * 2 * row_buf_stride; @@ -950,6 +967,7 @@ static void fa_softmax_thread(unsigned int n, unsigned int i, void * data) { factx->vtcm_s_rowmax[r_vec_idx] = rowmax_acc_v; factx->vtcm_p_rowsum[r_vec_idx] = rowsum_acc_v; } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, vec_start); } // Serial m/l update + build_D. Must run after softmax barrier (s_rowmax written by all threads). @@ -1245,6 +1263,7 @@ static __attribute__((noinline)) void fa_compute_slopes( // ============================================================================ int hmx_flash_attn_ext(struct htp_ops_context * octx) { + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[HTP_MAX_NTHREADS] : NULL; const struct htp_tensor * q = octx->src[0]; const struct htp_tensor * k = octx->src[1]; const struct htp_tensor * v = octx->src[2]; @@ -1293,13 +1312,13 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { const size_t g_br = hex_align_up(G * Br, HMX_FP16_TILE_N_ROWS); const uint32_t n_kv_blocks = (nek1 + Bc - 1) / Bc; - const bool use_pipeline = (n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads_init >= 2); + const bool pipeline = (n_kv_blocks >= FA_MIN_KV_BLOCKS && n_threads_init >= 2); // Bypass thread pool dispatch for small prompts/non-pipelined prefill by setting n_threads = 1 - const uint32_t n_threads = use_pipeline ? n_threads_init : 1; + const uint32_t n_threads = pipeline ? n_threads_init : 1; FARF(HIGH, "hmx-fa: neq1=%u nek1=%u DK=%u DV=%u G=%u Br=%zu Bc=%zu g_br=%zu n_kv_blocks=%u pipeline=%d vtcm=%zu", - neq1, nek1, DK, DV, G, Br, Bc, g_br, n_kv_blocks, use_pipeline, vtcm_budget); + neq1, nek1, DK, DV, G, Br, Bc, g_br, n_kv_blocks, pipeline, vtcm_budget); // ======== Build context ======== struct hmx_fa_context factx; @@ -1320,7 +1339,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { factx.n_kv_blocks = n_kv_blocks; factx.is_q_fp32 = (q->type == HTP_TYPE_F32); factx.is_dst_fp32 = (dst->type == HTP_TYPE_F32); - factx.use_pipeline = use_pipeline; + factx.pipeline = pipeline; factx.mask_broadcast = (mask != NULL && mask->ne[2] == 1); // Extract op parameters (mutable during softcap adjustment, then stored as const in factx) @@ -1386,7 +1405,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { factx.vtcm_v_fp16[1] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_dma_bytes); factx.vtcm_k_tiles = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, k_tile_bytes); factx.vtcm_v_tiles[0] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes); - if (use_pipeline) { + if (pipeline) { factx.vtcm_v_tiles[1] = (__fp16 *) vtcm_seq_alloc(&vtcm_cur, v_tile_bytes); } else { factx.vtcm_v_tiles[1] = NULL; @@ -1422,19 +1441,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { return HTP_STATUS_OK; } - // Profiling timers - TIMER_DEFINE(total); - TIMER_DEFINE(q_load); - TIMER_DEFINE(kv_dma); - TIMER_DEFINE(k_interleave); - TIMER_DEFINE(v_interleave); - TIMER_DEFINE(qk_dot); - TIMER_DEFINE(softmax); - TIMER_DEFINE(o_update); - TIMER_DEFINE(o_norm); - TIMER_DEFINE(o_store); - - TIMER_START(total); // ======== DMA setup ======== dma_queue * const dma = ctx->dma[0]; @@ -1450,7 +1456,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { // ======== HMX lock strategy ======== // Pipeline: queue thread auto-acquires HMX lock on first push; released by suspend. // Fallback: main thread holds the lock (original behavior). - if (!factx.use_pipeline) { + if (!factx.pipeline) { HAP_compute_res_hmx_lock(ctx->vtcm_rctx); } @@ -1474,12 +1480,10 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { const size_t n_row_tiles = g_br_actual / HMX_FP16_TILE_N_ROWS; // ---- Load Q block [g_br, D] -> tiles, interleaving G heads ---- - TIMER_START(q_load); if (n_rows_g < g_br) { hvx_splat_u8_a(factx.vtcm_q_tiles, 0, q_tile_bytes); } fa_phase_q_load(&factx, q, q_start, kv_head, ib3, n_rows_g); - TIMER_STOP(q_load); // ---- Initialize per-block state ---- hvx_splat_u8_a(factx.vtcm_l_vec, 0, col_vec_bytes); @@ -1546,7 +1550,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { const size_t k_src_stride = size_k_row_padded / sizeof(__fp16); const size_t v_src_stride = size_v_row_padded / sizeof(__fp16); - if (factx.use_pipeline) { + if (factx.pipeline) { // ================================================================== // Pipeline path: HVX phases ‖ HMX queue worker // ================================================================== @@ -1558,10 +1562,8 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { const size_t n_col_tiles = hmx_ceil_div(kv_rows, HMX_FP16_TILE_N_COLS); // Wait for current KV DMA - TIMER_START(kv_dma); dma_queue_pop(dma); // K dma_queue_pop(dma); // V - TIMER_STOP(kv_dma); // Push mask DMA for this block (single 2D DMA when broadcast) bool has_mask_dma = false; @@ -1583,10 +1585,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { ou_job.DV = DV; hmx_queue_push(hmx_q, hmx_queue_make_desc(hmx_fa_o_update_worker, &ou_job)); } - - TIMER_START(k_interleave); fa_phase_k_interleave(&factx, kv_rows, k_src_stride, buf_idx); - TIMER_STOP(k_interleave); // ---- Phase 2: qk_dot(blk) on HMX ‖ V_int(blk) + DMA prefetch on HVX ---- qk_job.q_tiles = factx.vtcm_q_tiles; @@ -1597,15 +1596,11 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { qk_job.n_dot_tiles = DK / 32; qk_job.n_tiles_per_bc = n_tiles_per_bc; qk_job.hmx_scales = factx.vtcm_hmx_scales_qk; - TIMER_START(qk_dot); hmx_queue_push(hmx_q, hmx_queue_make_desc(hmx_fa_qk_dot_worker, &qk_job)); // DMA push next block (non-blocking, before worker_pool) DMA_PREFETCH_KV(kv_blk + 1); - - TIMER_START(v_interleave); fa_phase_v_interleave(&factx, kv_rows, v_src_stride, buf_idx, n_tiles_per_bc); - TIMER_STOP(v_interleave); // Pop and swap previous block's output update (deferred HMX pop) if (kv_blk > 0) { @@ -1615,7 +1610,6 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { // Pop current block's dot product job hmx_queue_pop(hmx_q); - TIMER_STOP(qk_dot); // ---- Phase 3: softmax(blk) + build_D(blk) | HMX idle ---- // Pop mask DMA before softmax (ensures VTCM buffer is ready) @@ -1641,10 +1635,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { sargs.mask_vtcm = has_mask_dma ? (const __fp16 *) factx.vtcm_mask_buf : NULL; sargs.mask_vtcm_row_stride = factx.mask_buf_row_stride; sargs.slopes = factx.vtcm_slopes; - - TIMER_START(softmax); fa_phase_softmax_and_build_d(&factx, &sargs, n_row_tiles, n_row_tiles_g_br); - TIMER_STOP(softmax); buf_idx = 1 - buf_idx; } // end KV block loop (pipeline) @@ -1664,11 +1655,8 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { ou_job.n_row_tiles_g_br = n_row_tiles_g_br; ou_job.n_tiles_per_bc = n_tiles_per_bc; ou_job.DV = DV; - - TIMER_START(o_update); hmx_queue_push(hmx_q, hmx_queue_make_desc(hmx_fa_o_update_worker, &ou_job)); hmx_queue_pop(hmx_q); - TIMER_STOP(o_update); hex_swap_ptr((void **) &o_tile_curr, (void **) &o_tile_prev); } @@ -1683,23 +1671,14 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { const uint32_t kv_start = kv_blk * Bc; const uint32_t kv_rows = hex_smin(Bc, nek1 - kv_start); const size_t n_col_tiles = hmx_ceil_div(kv_rows, HMX_FP16_TILE_N_COLS); - - TIMER_START(kv_dma); dma_queue_pop(dma); // K dma_queue_pop(dma); // V - TIMER_STOP(kv_dma); bool has_mask_dma = false; MASK_DMA_PUSH(kv_start, kv_rows, has_mask_dma); DMA_PREFETCH_KV(kv_blk + 1); - - // K interleave (multi-thread HVX) - TIMER_START(k_interleave); fa_phase_k_interleave(&factx, kv_rows, k_src_stride, buf_idx); - TIMER_STOP(k_interleave); - // QK dot (inline HMX on main thread) - TIMER_START(qk_dot); { const size_t n_dot_tiles = (size_t) (DK / 32); const __fp16 * restrict q_base = factx.vtcm_q_tiles; @@ -1709,6 +1688,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { __builtin_assume(n_col_tiles > 0); __builtin_assume(n_dot_tiles > 0); + htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); Q6_bias_mxmem2_A((void *) factx.vtcm_hmx_scales_qk); for (size_t r = 0; r < n_row_tiles; ++r) { for (size_t c = 0; c < n_col_tiles; ++c) { @@ -1724,8 +1704,8 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { Q6_mxmem_AR_after_hf(out_tile, 0); } } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); } - TIMER_STOP(qk_dot); // Pop mask DMA MASK_DMA_POP(has_mask_dma); @@ -1751,21 +1731,9 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { sargs.mask_vtcm = has_mask_dma ? (const __fp16 *) factx.vtcm_mask_buf : NULL; sargs.mask_vtcm_row_stride = factx.mask_buf_row_stride; sargs.slopes = factx.vtcm_slopes; - - TIMER_START(softmax); fa_phase_softmax_and_build_d(&factx, &sargs, n_row_tiles, n_row_tiles_g_br); - TIMER_STOP(softmax); - - // V interleave (multi-thread HVX) - TIMER_START(v_interleave); - // FIX(v-stride): use n_tiles_per_bc (block-invariant) as V tile layout - // stride to match o_update's v_tile access. Using per-block n_col_tiles - // misplaces DV_tile 1..3 in the last partial KV block. fa_phase_v_interleave(&factx, kv_rows, v_src_stride, buf_idx, n_tiles_per_bc); - TIMER_STOP(v_interleave); - // O update (inline HMX on main thread) - TIMER_START(o_update); { const size_t DV_tiles = (size_t) (DV / 32); const __fp16 * restrict d_base = factx.vtcm_d_tiles; @@ -1777,6 +1745,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { __builtin_assume(n_col_tiles > 0); __builtin_assume(DV_tiles > 0); + htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); Q6_bias_mxmem2_A((void *) factx.vtcm_hmx_scales_id); for (size_t r = 0; r < n_row_tiles; ++r) { for (size_t c = 0; c < DV_tiles; ++c) { @@ -1798,21 +1767,20 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { Q6_mxmem_AR_after_hf(o_tile_out, 0); } } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); hex_swap_ptr((void **) &o_tile_curr, (void **) &o_tile_prev); } - TIMER_STOP(o_update); buf_idx = 1 - buf_idx; } // end KV block loop (fallback) } // ---- Final normalization: O = diag(1/l) @ O ---- - TIMER_START(o_norm); { fa_build_d_diag_inv_l(&factx, n_row_tiles, n_row_tiles_g_br); // HMX: O_final = diag(1/l) @ O_prev - if (factx.use_pipeline) { + if (factx.pipeline) { on_job.o_curr = o_tile_curr; on_job.o_prev = o_tile_prev; on_job.d_tiles = factx.vtcm_d_tiles; @@ -1830,6 +1798,7 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { __builtin_assume(n_row_tiles > 0); __builtin_assume(DV_tiles > 0); + htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); Q6_bias_mxmem2_A((void *) factx.vtcm_hmx_scales_id); for (size_t r = 0; r < n_row_tiles; ++r) { for (size_t c = 0; c < DV_tiles; ++c) { @@ -1842,14 +1811,12 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { Q6_mxmem_AR_after_hf(o_out, 0); } } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, HTP_MAX_NTHREADS); } } - TIMER_STOP(o_norm); // ---- Store O block ---- - TIMER_START(o_store); fa_phase_o_store(&factx, dst, o_tile_curr, q_start, kv_head, ib3, n_rows_g); - TIMER_STOP(o_store); #undef MASK_DMA_PUSH #undef MASK_DMA_POP @@ -1859,20 +1826,13 @@ int hmx_flash_attn_ext(struct htp_ops_context * octx) { } // end KV head loop } // end batch loop - if (factx.use_pipeline) { + if (factx.pipeline) { hmx_queue_suspend(ctx->hmx_queue); } else { HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); } - TIMER_STOP(total); -#if defined(ENABLE_PROFILE_TIMERS) - FARF(HIGH, "hmx-fa: %lld us, q_load=%lld kv_dma=%lld k_interleave=%lld v_interleave=%lld", TIMER_US(total), - TIMER_US(q_load), TIMER_US(kv_dma), TIMER_US(k_interleave), TIMER_US(v_interleave)); - FARF(HIGH, " qk_dot=%lld softmax=%lld o_update=%lld o_norm=%lld o_store=%lld", TIMER_US(qk_dot), TIMER_US(softmax), - TIMER_US(o_update), TIMER_US(o_norm), TIMER_US(o_store)); -#endif return HTP_STATUS_OK; } diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c deleted file mode 100644 index dab605210cf6..000000000000 --- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +++ /dev/null @@ -1,2066 +0,0 @@ -#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" -#pragma clang diagnostic ignored "-Wunused-function" -#pragma clang diagnostic ignored "-Wunused-variable" -#pragma clang diagnostic ignored "-Wunused-but-set-variable" - -#include -#include -#include -#include -#include - -#include -#include - -#define GGML_COMMON_DECL_C -#include "ggml-common.h" - -#include "hex-dma.h" -#include "hex-fastdiv.h" -#include "worker-pool.h" - -#include "hvx-utils.h" -#include "hvx-dump.h" -#include "htp-ctx.h" -#include "htp-ops.h" - -#include "hmx-ops.h" -#include "hmx-utils.h" -#include "hmx-queue.h" -#include "hmx-profile.h" - -#include "vtcm-utils.h" - -static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = { - -8, 0, -7, 0, -6, 0, -5, 0, -4, 0, -3, 0, -2, 0, -1, 0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, -}; - -static const __fp16 q4_1_to_fp16_lut[64] __attribute__((aligned(VLEN))) = { - 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0, 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0, -}; - -// MXFP4 dequantization LUT: maps 4-bit index to fp16 mantissa value -// kvalues: 0, 0.5, 1, 1.5, 2, 3, 4, 6, 0, -0.5, -1, -1.5, -2, -3, -4, -6 -static const __fp16 mxfp4_to_fp16_lut[64] __attribute__((aligned(VLEN))) = { - 0, 0, 0.5, 0, 1, 0, 1.5, 0, 2, 0, 3, 0, 4, 0, 6, 0, 0, 0, -0.5, 0, -1, 0, -1.5, 0, -2, 0, -3, 0, -4, 0, -6, 0, -}; - -static const __fp16 iq4_nl_to_fp16_lut[64] __attribute__((aligned(VLEN))) = { - -127, 0, -104, 0, -83, 0, -65, 0, -49, 0, -35, 0, -22, 0, -10, 0, - 1, 0, 13, 0, 25, 0, 38, 0, 53, 0, 69, 0, 89, 0, 113, 0, -}; - -// Scales per x4x2 logical block: 8 × sizeof(__fp16) = 16 bytes -#define HMX_X4X2_SCALES_PER_BLK 8 -#define HMX_X4X2_DBLK_SIZE 16 // 8 * 2 bytes (fp16 scales for Q4_0/Q8_0/IQ4_NL) -#define HMX_X4X2_MXFP4_EBLK_SIZE 8 // 8 * 1 byte (E8M0 scales for MXFP4) - -// Compute the byte stride of one row in x4x2 format. -// Numerically equals ggml_row_size(type, k) when k is 256-aligned, because -// x4x2 packing has the same density as block_q4_0 / block_q8_0. -// Layout per row: [quants: nb*128 (Q4) or nb*256 (Q8)][scales: nb*16 bytes] -// Total per row = nb * (128+16) = 144*nb (Q4) or nb * (256+16) = 272*nb (Q8). -// Callers must ensure k is a multiple of 256 (enforced by proc_hmx_matmul_req). -static inline size_t get_x4x2_row_stride(int weight_type, int k) { - int nb = (k + QK_Q4_0x4x2 - 1) / QK_Q4_0x4x2; - switch (weight_type) { - case HTP_TYPE_Q4_0: - case HTP_TYPE_IQ4_NL: - return (size_t) nb * (QK_Q4_0x4x2 / 2 + HMX_X4X2_DBLK_SIZE); // 144 * nb - case HTP_TYPE_Q4_1: - return (size_t) nb * (QK_Q4_0x4x2 / 2 + 32); // 160 * nb - case HTP_TYPE_Q8_0: - return (size_t) nb * (QK_Q8_0x4x2 + HMX_X4X2_DBLK_SIZE); // 272 * nb - case HTP_TYPE_MXFP4: - return (size_t) nb * (QK_MXFP4x4x2 / 2 + HMX_X4X2_MXFP4_EBLK_SIZE); // 136 * nb - case HTP_TYPE_F16: - return (size_t) k * sizeof(__fp16); - case HTP_TYPE_F32: - return (size_t) k * sizeof(float); - default: - return 0; - } -} - -// --- Overflow-safe arithmetic for VTCM budget calculation --- - -static inline bool hmx_mul_overflow(size_t a, size_t b, size_t *out) { - if (a != 0 && b > SIZE_MAX / a) return true; - *out = a * b; - return false; -} - -static inline bool hmx_add_overflow(size_t a, size_t b, size_t *out) { - if (a > SIZE_MAX - b) return true; - *out = a + b; - return false; -} - -// Search for optimal (mc, nc) chunk sizes within VTCM budget. -// -// VTCM model: nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost + overhead -// -// Minimize ceil(m/mc) * m_block_cost + ceil(n/nc) * n_block_cost. -// All matmul paths repeat weight processing per M-block and activation loading -// per N-block, so discrete block counts drive total overhead. -// Tie-break: when cost is equal, prefer larger mc * nc. -// -// Caller-provided coefficients: -// m_block_cost: penalty per extra M-block (weight redundancy, scales with n). -// n_block_cost: penalty per extra N-block (activation redundancy, scales with m). -// -// Algorithm: nc sweeps from n_max down by 32, analytically solving for mc_max. -// Returns 0 on success, -1 if VTCM is insufficient. -static int hmx_compute_chunks(size_t vtcm_total, - size_t overhead, - size_t per_n_cost, - size_t per_m_cost, - size_t per_mn_cost, - int m, - int n, - size_t m_block_cost, - size_t n_block_cost, - size_t * m_chunk_out, - size_t * n_chunk_out, - size_t * total_out) { - if (m <= 0 || n <= 0) return -1; - if (vtcm_total <= overhead) return -1; - if (per_n_cost == 0 || per_m_cost == 0 || per_mn_cost == 0) return -1; - - const size_t usable = vtcm_total - overhead; - - size_t best_cost = SIZE_MAX; - size_t best_mn = 0; - size_t best_m = 0, best_n = 0; - - const size_t n_max = hex_align_down((size_t)n, HMX_FP16_TILE_N_COLS); - for (size_t nc = n_max; nc >= HMX_FP16_TILE_N_COLS; nc -= HMX_FP16_TILE_N_COLS) { - size_t n_fixed = 0, ncmn = 0, mc_denom = 0; - if (hmx_mul_overflow(nc, per_n_cost, &n_fixed)) continue; - if (n_fixed >= usable) goto next_nc; - - if (hmx_mul_overflow(nc, per_mn_cost, &ncmn)) goto next_nc; - if (hmx_add_overflow(per_m_cost, ncmn, &mc_denom) || mc_denom == 0) goto next_nc; - - { - size_t remain = usable - n_fixed; - size_t mc = remain / mc_denom; - mc = hex_align_down(mc, HMX_FP16_TILE_N_ROWS); - mc = hex_smin(mc, (size_t)m); - - if (mc == 0) { - goto next_nc; - } - - size_t mblocks = ((size_t) m + mc - 1) / mc; - size_t nblocks = ((size_t) n + nc - 1) / nc; - size_t cost = mblocks * m_block_cost + nblocks * n_block_cost; - size_t mn = mc * nc; - if (cost < best_cost || (cost == best_cost && mn > best_mn)) { - best_cost = cost; - best_mn = mn; - best_m = mc; - best_n = nc; - } - } - -next_nc: - if (nc == HMX_FP16_TILE_N_COLS) break; // avoid size_t underflow - } - - if (best_m == 0 || best_n == 0) return -1; - - // Compute exact total (with overflow checks) - size_t t0 = 0, t1 = 0, t2 = 0, mn = 0, total = 0; - if (hmx_mul_overflow(best_n, per_n_cost, &t0)) return -1; - if (hmx_mul_overflow(best_m, per_m_cost, &t1)) return -1; - if (hmx_mul_overflow(best_m, best_n, &mn)) return -1; - if (hmx_mul_overflow(mn, per_mn_cost, &t2)) return -1; - if (hmx_add_overflow(t0, t1, &total)) return -1; - if (hmx_add_overflow(total, t2, &total)) return -1; - if (hmx_add_overflow(total, overhead, &total)) return -1; - - *m_chunk_out = best_m; - *n_chunk_out = best_n; - *total_out = total; - return 0; -} - -// --- x4x2 format dequantizers --- - -// Dequantize one x4x2 Q4_0 group (32 elements from 32 packed bytes) -> 32 FP16 in first 64 bytes. -// In x4x2, sub-blocks 0..3 use lower nibbles, sub-blocks 4..7 use upper nibbles -// of the same 32 packed bytes. -static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale, const HVX_Vector vlut_cvt) { - (void)vlut_cvt; - HVX_Vector vq = hvx_vmemu(packed_32); - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - const HVX_Vector i8 = Q6_Vb_vsplat_R(8); - HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale)); - - HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles); - v_quants = Q6_V_vand_VV(v_quants, mask_h4); - - HVX_Vector v_int8 = Q6_Vb_vsub_VbVb(v_quants, i8); - HVX_Vector v0 = Q6_V_lo_W(Q6_Wh_vunpack_Vb(v_int8)); - HVX_Vector v_hf = Q6_Vhf_equals_Vh(v0); - - return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales)); -} - -// Batch-dequantize 4 contiguous x4x2 Q4_0 groups (4x32 = 128 packed bytes) using -// full HVX vector width. -// Output: vector_x2 each hold 32 FP16 values in the first 64 bytes. -static inline HVX_Vector_x2 dequantize_x4x2_q4_0_x4groups_hvx( - const uint8_t *packed_128, bool upper_nibbles, - const __fp16 *scales_4, const HVX_Vector vlut_cvt) { - (void)vlut_cvt; - HVX_Vector vq = hvx_vmemu(packed_128); - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - const HVX_Vector i8 = Q6_Vb_vsplat_R(8); - HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles); - v_quants = Q6_V_vand_VV(v_quants, mask_h4); - - HVX_Vector v_int8 = Q6_Vb_vsub_VbVb(v_quants, i8); - - HVX_VectorPair vp_int16 = Q6_Wh_vunpack_Vb(v_int8); - HVX_Vector v_lo = Q6_V_lo_W(vp_int16); - HVX_Vector v_hi = Q6_V_hi_W(vp_int16); - - v_lo = Q6_Vhf_equals_Vh(v_lo); - v_hi = Q6_Vhf_equals_Vh(v_hi); - - HVX_Vector vscale = hvx_vmemu(scales_4); - HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale); - HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4)); - - v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01)); - v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23)); - - HVX_Vector_x2 r = { v_lo, v_hi }; - return r; -} - -static inline HVX_Vector dequantize_x4x2_q4_1_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale_offset, const HVX_Vector vlut_cvt) { - (void)vlut_cvt; - HVX_Vector vq = hvx_vmemu(packed_32); - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - HVX_Vector v_dm = hvx_vmemu(scale_offset); - HVX_Vector v_scales = hvx_vec_repl_f16(v_dm); - HVX_Vector v_offsets = hvx_vec_repl_f16(Q6_V_vror_VR(v_dm, 2)); - - HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles); - v_quants = Q6_V_vand_VV(v_quants, mask_h4); - - HVX_Vector v0 = Q6_V_lo_W(Q6_Wh_vunpack_Vb(v_quants)); - HVX_Vector v_hf = Q6_Vhf_equals_Vh(v0); - - return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales), v_offsets)); -} - -static inline HVX_Vector_x2 dequantize_x4x2_q4_1_x4groups_hvx( - const uint8_t *packed_128, bool upper_nibbles, - const __fp16 *scales_offsets_4, const HVX_Vector vlut_cvt) { - (void)vlut_cvt; - HVX_Vector vq = hvx_vmemu(packed_128); - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles); - v_quants = Q6_V_vand_VV(v_quants, mask_h4); - - HVX_VectorPair vp_int16 = Q6_Wh_vunpack_Vb(v_quants); - HVX_Vector v_lo = Q6_V_lo_W(vp_int16); - HVX_Vector v_hi = Q6_V_hi_W(vp_int16); - - v_lo = Q6_Vhf_equals_Vh(v_lo); - v_hi = Q6_Vhf_equals_Vh(v_hi); - - HVX_Vector vscale_offset = hvx_vmemu(scales_offsets_4); - HVX_VectorPair dm_deal = Q6_W_vdeal_VVR(vscale_offset, vscale_offset, -2); - HVX_Vector vd = Q6_V_lo_W(dm_deal); - HVX_Vector vm = Q6_V_hi_W(dm_deal); - - HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vd); - HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vd, 4)); - - HVX_Vector v_os01 = hvx_vec_repl_2x_f16(vm); - HVX_Vector v_os23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vm, 4)); - - v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01), v_os01)); - v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23), v_os23)); - - HVX_Vector_x2 r = { v_lo, v_hi }; - return r; -} - -// LUT-based dequantizers for non-linear IQ4_NL format. -static inline HVX_Vector dequantize_x4x2_iq4_nl_group_hvx(const uint8_t *packed_32, bool upper_nibbles, const __fp16 *scale, const HVX_Vector vlut_cvt) { - HVX_Vector vq = hvx_vmemu(packed_32); - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale)); - HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles); - v_quants = Q6_V_vand_VV(v_quants, mask_h4); - v_quants = Q6_Vb_vshuff_Vb(v_quants); - HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0); - HVX_Vector v_hf = Q6_V_lo_W(vp); - - return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales)); -} - -static inline HVX_Vector_x2 dequantize_x4x2_iq4_nl_x4groups_hvx( - const uint8_t *packed_128, bool upper_nibbles, - const __fp16 *scales_4, const HVX_Vector vlut_cvt) { - HVX_Vector vq = hvx_vmemu(packed_128); - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles); - v_quants = Q6_V_vand_VV(v_quants, mask_h4); - - v_quants = Q6_Vb_vshuff_Vb(v_quants); - - HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0); - HVX_Vector v_lo = Q6_V_lo_W(vp); - HVX_Vector v_hi = Q6_V_hi_W(vp); - - HVX_Vector vscale = hvx_vmemu(scales_4); - HVX_Vector v_sc01 = hvx_vec_repl_2x_f16(vscale); - HVX_Vector v_sc23 = hvx_vec_repl_2x_f16(Q6_V_vror_VR(vscale, 4)); - - v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01)); - v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23)); - - HVX_Vector_x2 r = { v_lo, v_hi }; - return r; -} - -// Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes. -static inline HVX_Vector dequantize_x4x2_q8_0_group_hvx(const int8_t *quants_32, const __fp16 *scale) { - HVX_Vector vq = hvx_vmemu(quants_32); - HVX_Vector v_scales = hvx_vec_repl_f16(hvx_vmemu(scale)); - HVX_Vector v0 = Q6_V_lo_W(Q6_Wh_vunpack_Vb(vq)); - HVX_Vector v_hf = Q6_Vhf_equals_Vh(v0); - return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_scales)); -} - -// --- MXFP4 E8M0 scale conversion and dequantization --- -// -// HVX batch-convert 8 E8M0 bytes (one x4x2 block's scales) to __fp16[8] on stack. -// Scalar loads from the stack array execute on the scalar pipeline, in parallel -// with HVX vlut16/vmpy/vscatter — freeing HVX slots in the hot loop. -// Arithmetic: fp16_bits = clamp(e - 112, 0, 30) << 10 -// e=0..112 -> 0 (underflow), e=113..142 -> valid fp16, e>=143 -> clamped to 2^15. - -typedef struct { - __fp16 v[8] __attribute__((aligned(16))); -} mxfp4_scales_t; - -static inline mxfp4_scales_t mxfp4_convert_scales(const uint8_t * e8m0_8) { - mxfp4_scales_t s; - HVX_Vector v = hvx_vmemu(e8m0_8); - HVX_Vector vh = Q6_V_lo_W(Q6_Wuh_vunpack_Vub(v)); - vh = Q6_Vh_vsub_VhVh(vh, Q6_Vh_vsplat_R(112)); - vh = Q6_Vh_vmax_VhVh(vh, Q6_V_vzero()); - vh = Q6_Vh_vmin_VhVh(vh, Q6_Vh_vsplat_R(30)); - vh = Q6_Vh_vasl_VhR(vh, 10); - hvx_vec_store_u(s.v, 16, vh); - return s; -} - -static inline HVX_Vector mxfp4_extract_splat(mxfp4_scales_t scales, int idx) { - return hvx_vec_splat_f16(scales.v[idx]); -} - -// Dequantize one x4x2 MXFP4 group (32 elements from 32 packed bytes) -> 32 FP16. -static inline HVX_Vector dequantize_x4x2_mxfp4_group_hvx(const uint8_t * packed_32, - bool upper_nibbles, - int sub_blk, - const HVX_Vector vlut_cvt, - mxfp4_scales_t scales) { - HVX_Vector vq = hvx_vmemu(packed_32); - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq; - v_quants = Q6_V_vand_VV(v_quants, mask_h4); - - HVX_Vector v_sc = mxfp4_extract_splat(scales, sub_blk); - - v_quants = Q6_Vb_vshuff_Vb(v_quants); - HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0); - HVX_Vector v_hf = Q6_V_lo_W(vp); - - return Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hf, v_sc)); -} - -// Batch-dequantize 4 contiguous x4x2 MXFP4 groups (4x32 = 128 packed bytes). -static inline HVX_Vector_x4 dequantize_x4x2_mxfp4_x4groups_hvx(const uint8_t * packed_128, - bool upper_nibbles, - int sub_blk_base, - const HVX_Vector vlut_cvt, - mxfp4_scales_t scales) { - HVX_Vector vq = hvx_vmemu(packed_128); - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq; - v_quants = Q6_V_vand_VV(v_quants, mask_h4); - - v_quants = Q6_Vb_vshuff_Vb(v_quants); - - HVX_VectorPair vp = Q6_Wh_vlut16_VbVhR(v_quants, vlut_cvt, 0); - HVX_Vector v_lo = Q6_V_lo_W(vp); - HVX_Vector v_hi = Q6_V_hi_W(vp); - - HVX_VectorPred q64 = Q6_Q_vsetq_R(64); - HVX_Vector v_sc01 = Q6_V_vmux_QVV(q64, mxfp4_extract_splat(scales, sub_blk_base + 0), - mxfp4_extract_splat(scales, sub_blk_base + 1)); - HVX_Vector v_sc23 = Q6_V_vmux_QVV(q64, mxfp4_extract_splat(scales, sub_blk_base + 2), - mxfp4_extract_splat(scales, sub_blk_base + 3)); - - v_lo = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_lo, v_sc01)); - v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23)); - - HVX_Vector_x4 r = { v_lo, Q6_V_vror_VR(v_lo, 64), v_hi, Q6_V_vror_VR(v_hi, 64) }; - return r; -} - -typedef struct { - __fp16 *dst; - const uint8_t *src; - int n_cols; - int k_block; - size_t row_stride; - int weight_type; - int n_tot_tiles; - int n_tiles_per_task; - int n_tasks; - int n_k_tiles; - struct fastdiv_values n_k_tiles_div; -} x4x2_dequantize_state_t; - -// Dequantize a tile range from x4x2 weight data (already in VTCM) to tile-major FP16. -// Input: vtcm_src has n_cols rows of x4x2 data, each row_stride bytes. -// Output: vtcm_dst in tile-major FP16 layout. - -#define DEFINE_DEQUANTIZE_Q4_TASK(suffix, lut_name, helper_prefix, dblk_size, scale_step) \ -static void dequantize_x4x2_weight_to_fp16_tiles_task_##suffix( \ - const x4x2_dequantize_state_t *state, \ - int start_tile, int end_tile) { \ - \ - const int n_k_tiles = state->n_k_tiles; \ - const int qrow_size = (unsigned)state->k_block / 2; \ - const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div; \ - const HVX_Vector vlut_cvt = hvx_vmem(lut_name); \ - \ - const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets); \ - const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); \ - const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); \ - \ - unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div); \ - unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div); \ - \ - for (unsigned t = start_tile; t < (unsigned)end_tile; ) { \ - if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; } \ - \ - if ((kt % 4 == 0) && (t + 4 <= (unsigned)end_tile) && (fastdiv(t + 3, &n_k_tiles_div) == ct)) { \ - unsigned blk_idx = ((kt * 32) / QK_Q4_0x4x2); \ - unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; \ - bool upper = (sub_blk_base >= 4); \ - unsigned packed_off = blk_idx * (QK_Q4_0x4x2 / 2); \ - unsigned scale_off = qrow_size + blk_idx * (dblk_size) + sub_blk_base * (scale_step); \ - \ - __fp16 *tile_bases[4]; \ - for (unsigned g = 0; g < 4; g++) { \ - tile_bases[g] = state->dst + (t + g) * HMX_FP16_TILE_N_ELMS; \ - } \ - \ - HVX_Vector v_off = v_scat_base; \ - unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * state->row_stride; \ - \ - for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { \ - const uint8_t *r0 = state->src + row_offset; row_offset += state->row_stride; \ - const uint8_t *r1 = state->src + row_offset; row_offset += state->row_stride; \ - \ - HVX_Vector_x2 dv0 = dequantize_x4x2_##helper_prefix##_x4groups_hvx( \ - r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt); \ - Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[0]); \ - Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[1]); \ - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); \ - \ - HVX_Vector_x2 dv1 = dequantize_x4x2_##helper_prefix##_x4groups_hvx( \ - r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt); \ - Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[0]); \ - Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[1]); \ - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); \ - } \ - \ - for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); } \ - t += 4; kt += 4; \ - continue; \ - } \ - \ - __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS; \ - { \ - unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2; \ - unsigned sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32; \ - bool upper = (sub_blk >= 4); \ - unsigned byte_off = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32; \ - unsigned scale_off = qrow_size + blk_idx * (dblk_size) + sub_blk * (scale_step); \ - \ - HVX_Vector v_off = v_scat_base; \ - unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * state->row_stride; \ - unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1; \ - \ - for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) { \ - const uint8_t *r0 = state->src + row_offset; row_offset += state->row_stride; \ - const uint8_t *r1 = state->src + row_offset; row_offset += state->row_stride; \ - \ - HVX_Vector v0 = dequantize_x4x2_##helper_prefix##_group_hvx( \ - r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt); \ - HVX_Vector v1 = (row1 < (unsigned)state->n_cols) \ - ? dequantize_x4x2_##helper_prefix##_group_hvx( \ - r1 + byte_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt) \ - : Q6_V_vzero(); \ - \ - Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0); \ - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); \ - Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1); \ - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); \ - } \ - (void) *(volatile HVX_Vector *)(tile_base); \ - } \ - ++t; ++kt; \ - } \ - \ - if (start_tile < end_tile) { \ - (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS); \ - } \ -} \ - \ -static void dequantize_x4x2_worker_loop_##suffix(unsigned int n, unsigned int i, void *data) { \ - x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; \ - for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { \ - int start = task_id * state->n_tiles_per_task; \ - int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); \ - dequantize_x4x2_weight_to_fp16_tiles_task_##suffix(state, start, end); \ - } \ -} - -DEFINE_DEQUANTIZE_Q4_TASK(q4_0, q4_0_to_fp16_lut, q4_0, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16)) -DEFINE_DEQUANTIZE_Q4_TASK(q4_1, q4_1_to_fp16_lut, q4_1, 32, 4) -DEFINE_DEQUANTIZE_Q4_TASK(iq4_nl, iq4_nl_to_fp16_lut, iq4_nl, HMX_X4X2_DBLK_SIZE, (int)sizeof(__fp16)) - -static void dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4( - const x4x2_dequantize_state_t *state, - int start_tile, int end_tile) { - - const int n_k_tiles = state->n_k_tiles; - const int qrow_size = (unsigned)state->k_block / 2; - const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div; - const HVX_Vector vlut_cvt = hvx_vmem(mxfp4_to_fp16_lut); - - const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets); - const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); - const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); - - unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div); - unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div); - - for (unsigned t = start_tile; t < (unsigned)end_tile; ) { - if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; } - - // Batch-4 fast path for MXFP4 - if ((kt % 4 == 0) && (t + 4 <= (unsigned)end_tile) && (fastdiv(t + 3, &n_k_tiles_div) == ct)) { - int blk_idx = (kt * 32) / QK_MXFP4x4x2; - int sub_blk_base = ((kt * 32) % QK_MXFP4x4x2) / 32; - bool upper = (sub_blk_base >= 4); - int packed_off = blk_idx * (QK_MXFP4x4x2 / 2); - int e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE; - - __fp16 * tile_bases[4]; - for (int g = 0; g < 4; g++) { - tile_bases[g] = state->dst + (t + g) * HMX_FP16_TILE_N_ELMS; - } - - HVX_Vector v_off = v_scat_base; - for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { - int row0 = ct * HMX_FP16_TILE_N_COLS + r; - int row1 = row0 + 1; - const uint8_t * r0 = state->src + row0 * state->row_stride; - const uint8_t * r1 = state->src + row1 * state->row_stride; - - mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off); - - HVX_Vector_x4 dv0, dv1; - dv0 = dequantize_x4x2_mxfp4_x4groups_hvx(r0 + packed_off, upper, sub_blk_base, vlut_cvt, r0_e8); - if (row1 < state->n_cols) { - mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off); - dv1 = dequantize_x4x2_mxfp4_x4groups_hvx(r1 + packed_off, upper, sub_blk_base, vlut_cvt, r1_e8); - } else { - dv1.v[0] = dv1.v[1] = dv1.v[2] = dv1.v[3] = Q6_V_vzero(); - } - - for (int g = 0; g < 4; g++) { - Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, dv0.v[g]); - } - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); - for (int g = 0; g < 4; g++) { - Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, dv1.v[g]); - } - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); - } - - for (int g = 0; g < 4; g++) { - (void) *(volatile HVX_Vector *) (tile_bases[g]); - } - - t += 4; kt += 4; - continue; - } - - // Single-tile fallback - __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS; - { - int blk_idx = (kt * 32) / QK_MXFP4x4x2; - int sub_blk = ((kt * 32) % QK_MXFP4x4x2) / 32; - bool upper = (sub_blk >= 4); - int byte_off = blk_idx * (QK_MXFP4x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32; - int e8m0_blk_off = qrow_size + blk_idx * HMX_X4X2_MXFP4_EBLK_SIZE; - - HVX_Vector v_off = v_scat_base; - for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { - int row0 = ct * HMX_FP16_TILE_N_COLS + r; - int row1 = row0 + 1; - - const uint8_t * r0 = state->src + row0 * state->row_stride; - const uint8_t * r1 = state->src + row1 * state->row_stride; - - mxfp4_scales_t r0_e8 = mxfp4_convert_scales(r0 + e8m0_blk_off); - - HVX_Vector v0 = dequantize_x4x2_mxfp4_group_hvx(r0 + byte_off, upper, sub_blk, vlut_cvt, r0_e8); - HVX_Vector v1; - if (row1 < state->n_cols) { - mxfp4_scales_t r1_e8 = mxfp4_convert_scales(r1 + e8m0_blk_off); - v1 = dequantize_x4x2_mxfp4_group_hvx(r1 + byte_off, upper, sub_blk, vlut_cvt, r1_e8); - } else { - v1 = Q6_V_vzero(); - } - - Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0); - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); - Q6_vscatter_QRMVwV(q_mask64, (size_t) tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1); - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); - } - (void) *(volatile HVX_Vector *) (tile_base); - } - ++t; ++kt; - } - - if (start_tile < end_tile) { - (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS); - } -} - -static void dequantize_x4x2_worker_loop_mxfp4(unsigned int n, unsigned int i, void *data) { - x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; - for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { - int start = task_id * state->n_tiles_per_task; - int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); - dequantize_x4x2_weight_to_fp16_tiles_task_mxfp4(state, start, end); - } -} - -static void dequantize_x4x2_weight_to_fp16_tiles_task_q8_0( - const x4x2_dequantize_state_t *state, - int start_tile, int end_tile) { - - const int n_k_tiles = state->n_k_tiles; - const int qrow_size = state->k_block; - const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div; - - const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets); - const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); - const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); - - unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div); - unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div); - - for (unsigned t = start_tile; t < (unsigned)end_tile; ) { - if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; } - - __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS; - { - int blk_idx = (kt * 32) / QK_Q8_0x4x2; - int sub_blk = ((kt * 32) % QK_Q8_0x4x2) / 32; - int byte_off = blk_idx * QK_Q8_0x4x2 + sub_blk * 32; - int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16); - - HVX_Vector v_off = v_scat_base; - for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { - int row0 = ct * HMX_FP16_TILE_N_COLS + r; - int row1 = row0 + 1; - - const uint8_t *r0 = state->src + row0 * state->row_stride; - const uint8_t *r1 = state->src + row1 * state->row_stride; - - HVX_Vector v0 = dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r0 + byte_off), (const __fp16 *)(r0 + scale_off)); - HVX_Vector v1 = (row1 < state->n_cols) ? dequantize_x4x2_q8_0_group_hvx((const int8_t *)(r1 + byte_off), (const __fp16 *)(r1 + scale_off)) : Q6_V_vzero(); - - Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0); - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); - Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1); - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); - } - (void) *(volatile HVX_Vector *)(tile_base); - } - ++t; ++kt; - } - - if (start_tile < end_tile) { - (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS); - } -} - -static void dequantize_x4x2_worker_loop_q8_0(unsigned int n, unsigned int i, void *data) { - x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; - for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { - int start = task_id * state->n_tiles_per_task; - int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); - dequantize_x4x2_weight_to_fp16_tiles_task_q8_0(state, start, end); - } -} - -static void convert_f16_weight_to_fp16_tiles_task( - const x4x2_dequantize_state_t *state, - int start_tile, int end_tile) { - - const int n_k_tiles = state->n_k_tiles; - const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div; - - const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets); - const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); - const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); - - unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div); - unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div); - - for (unsigned t = start_tile; t < (unsigned)end_tile; ) { - if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; } - - __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS; - { - int byte_off = kt * 32 * sizeof(__fp16); - - HVX_Vector v_off = v_scat_base; - for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { - int row0 = ct * HMX_FP16_TILE_N_COLS + r; - int row1 = row0 + 1; - - const uint8_t *r0 = state->src + row0 * state->row_stride; - const uint8_t *r1 = state->src + row1 * state->row_stride; - - HVX_Vector v0 = hvx_vmemu((const __fp16 *)(r0 + byte_off)); - HVX_Vector v1 = (row1 < state->n_cols) ? hvx_vmemu((const __fp16 *)(r1 + byte_off)) : Q6_V_vzero(); - - Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v0); - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); - Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v1); - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); - } - (void) *(volatile HVX_Vector *)(tile_base); - } - ++t; ++kt; - } - - if (start_tile < end_tile) { - (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS); - } -} - -static void convert_f16_worker_loop(unsigned int n, unsigned int i, void *data) { - x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; - for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { - int start = task_id * state->n_tiles_per_task; - int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); - convert_f16_weight_to_fp16_tiles_task(state, start, end); - } -} - -static void quantize_f32_weight_to_fp16_tiles_task( - const x4x2_dequantize_state_t *state, - int start_tile, int end_tile) { - - const int n_k_tiles = state->n_k_tiles; - const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div; - - const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets); - const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); - const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); - - unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div); - unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div); - - for (unsigned t = start_tile; t < (unsigned)end_tile; ) { - if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; } - - __fp16 *tile_base = state->dst + t * HMX_FP16_TILE_N_ELMS; - { - int byte_off = kt * 32 * sizeof(float); - - HVX_Vector v_off = v_scat_base; - for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { - int row0 = ct * HMX_FP16_TILE_N_COLS + r; - int row1 = row0 + 1; - - const uint8_t *r0 = state->src + row0 * state->row_stride; - const uint8_t *r1 = state->src + row1 * state->row_stride; - - HVX_Vector v0_f32 = hvx_vmemu((const float *)(r0 + byte_off)); - HVX_Vector v1_f32 = (row1 < state->n_cols) ? hvx_vmemu((const float *)(r1 + byte_off)) : Q6_V_vzero(); - - HVX_Vector v_out = hvx_vec_f32_to_f16(v0_f32, v1_f32); - - Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v_out); - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); - - HVX_Vector v_out_hi = Q6_V_vror_VR(v_out, 64); - Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HMX_FP16_TILE_SIZE - 1, v_off, v_out_hi); - v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); - } - (void) *(volatile HVX_Vector *)(tile_base); - } - ++t; ++kt; - } - - if (start_tile < end_tile) { - (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HMX_FP16_TILE_N_ELMS); - } -} - -static void quantize_f32_worker_loop(unsigned int n, unsigned int i, void *data) { - x4x2_dequantize_state_t *state = (x4x2_dequantize_state_t *)data; - for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { - int start = task_id * state->n_tiles_per_task; - int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); - quantize_f32_weight_to_fp16_tiles_task(state, start, end); - } -} - - -static void dequantize_x4x2_weight_chunk_to_fp16_tiles( - struct htp_context *ctx, __fp16 *vtcm_dst, - const void *vtcm_src, int n_cols, int k_block, - size_t row_stride, int weight_type, - int n_k_tiles, struct fastdiv_values n_k_tiles_div, - worker_callback_t dequant_worker_fn, int n_threads) { - - assert(n_cols % HMX_FP16_TILE_N_COLS == 0); - assert(k_block % HMX_FP16_TILE_N_COLS == 0); - - size_t n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS; - size_t n_tot_tiles = n_col_tiles * n_k_tiles; - - size_t n_tiles_per_task = (n_threads == 1) ? n_tot_tiles : hmx_ceil_div(n_tot_tiles, n_threads); - - x4x2_dequantize_state_t state; - state.n_tasks = (n_tot_tiles + n_tiles_per_task - 1) / n_tiles_per_task; - state.n_tot_tiles = n_tot_tiles; - state.n_tiles_per_task = n_tiles_per_task; - state.dst = vtcm_dst; - state.src = (const uint8_t *)vtcm_src; - state.n_cols = n_cols; - state.k_block = k_block; - state.row_stride = row_stride; - state.weight_type = weight_type; - state.n_k_tiles = n_k_tiles; - state.n_k_tiles_div = n_k_tiles_div; - - if (state.n_tasks == 1 || n_threads == 1) { - dequant_worker_fn(1, 0, &state); - } else { - worker_pool_run_func(ctx->worker_pool, dequant_worker_fn, &state, n_threads); - } -} - -// --- End x4x2 dequantizers --- - -#pragma clang diagnostic ignored "-Wbackend-plugin" // spurios warning for hmx intrinsics - -// requires external HMX lock -static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict activation, const __fp16 *restrict weight, const __fp16 *restrict scales, - int n_row_tiles, int n_col_tiles, int n_dot_tiles) { - __builtin_assume(n_row_tiles > 0); - __builtin_assume(n_col_tiles > 0); - __builtin_assume(n_dot_tiles > 0); - - Q6_bias_mxmem2_A((void *)scales); - for (int r = 0; r < n_row_tiles; ++r) { - for (size_t c = 0; c < n_col_tiles; ++c) { - Q6_mxclracc_hf(); - - const __fp16 *row_tiles = activation + r * n_dot_tiles * HMX_FP16_TILE_N_ELMS; - const __fp16 *col_tiles = weight + c * n_dot_tiles * HMX_FP16_TILE_N_ELMS; - - for (int k = 0, k_block; k < n_dot_tiles; k += k_block) { - k_block = hex_smin(n_dot_tiles - k, 32); - const uint32_t range = 2048u * (uint32_t)k_block - 1; - Q6_activation_hf_mxmem_RR_deep((unsigned int)row_tiles, range); - Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, range); - row_tiles += k_block * HMX_FP16_TILE_N_ELMS; - col_tiles += k_block * HMX_FP16_TILE_N_ELMS; - } - - __fp16 *out_tile = output + (r * n_col_tiles + c) * HMX_FP16_TILE_N_ELMS; - Q6_mxmem_AR_after_hf(out_tile, 0); - } - } -} - -// --- Async HMX matmul job (for pipeline overlap) --- - -typedef struct { - __fp16 * output; - const __fp16 * activation; - const __fp16 * weight; - const __fp16 * scales; - uint32_t n_row_tiles; - uint32_t n_col_tiles; - uint32_t n_dot_tiles; -} hmx_matmul_job_t; - -static void hmx_matmul_worker_fn(void * data) { - hmx_matmul_job_t * job = (hmx_matmul_job_t *) data; - FARF(HIGH, "hmx-mm-job: n_row_tiles %u n_col_tiles %u n_dot_tiles %u", job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles); - core_dot_chunk_fp16(job->output, job->activation, job->weight, job->scales, job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles); -} - -static inline void hmx_matmul_job_init(hmx_matmul_job_t * job, - __fp16 * output, - const __fp16 * activation, - const __fp16 * weight, - const __fp16 * scales, - int n_row_tiles, - int n_col_tiles, - int n_dot_tiles) { - job->output = output; - job->activation = activation; - job->weight = weight; - job->scales = scales; - job->n_row_tiles = n_row_tiles; - job->n_col_tiles = n_col_tiles; - job->n_dot_tiles = n_dot_tiles; -} - -// output : fp16 -> f32p - -static void transfer_output_chunk_fp16_to_fp32(float *restrict dst, const __fp16 *restrict vtcm_src, int n_rows, int n_cols, int n) { - assert(n_cols % HMX_FP16_TILE_N_COLS == 0); - const size_t tile_row_stride = (n_cols / HMX_FP16_TILE_N_COLS) * HMX_FP16_TILE_N_ELMS; - - const HVX_Vector one = hvx_vec_splat_f16(1.0); - - for (size_t r = 0; r < n_rows; r += 2) { - const size_t r0 = r / HMX_FP16_TILE_N_ROWS; - const size_t r1 = (r % HMX_FP16_TILE_N_ROWS) / 2; // index of the row pair within the tile - const __fp16 *row_base = vtcm_src + r0 * tile_row_stride; - float *output_row_base = dst + r * n; // global memory row base for row r (and r+1) - - #pragma unroll(4) - for (size_t c = 0; c < n_cols; c += HMX_FP16_TILE_N_COLS) { - const size_t c0 = c / HMX_FP16_TILE_N_COLS; - const __fp16 *tile = row_base + c0 * HMX_FP16_TILE_N_ELMS; - HVX_Vector v = ((const HVX_Vector *) tile)[r1]; - HVX_VectorPair vp = Q6_Wqf32_vmpy_VhfVhf(v, one); - - volatile HVX_Vector *pv_out0 = (volatile HVX_Vector *) (output_row_base + c + 0); - volatile HVX_Vector *pv_out1 = (volatile HVX_Vector *) (output_row_base + c + n); // next row in global memory - - *pv_out0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(vp)); - if (r + 1 < n_rows) { - *pv_out1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(vp)); - } - } - } -} - -typedef struct { - const __fp16 *vtcm_src; - float *dst; - int n_tasks; - int n_tot_chunks; - int n_chunks_per_task; - int n_cols; - int n; // DDR row stride (total output columns) -} output_transfer_task_state_t; - -static void transfer_output_chunk_worker_fn(unsigned int n, unsigned int i, void *data) { - output_transfer_task_state_t *st = (output_transfer_task_state_t *) data; - - for (unsigned int task_id = i; task_id < (unsigned int)st->n_tasks; task_id += n) { - int chunk_idx = task_id * st->n_chunks_per_task; - size_t chunk_size = hex_smin(st->n_tot_chunks - chunk_idx, st->n_chunks_per_task); - - float *dst = st->dst + chunk_idx * st->n; - const __fp16 *vtcm_src = st->vtcm_src + chunk_idx * st->n_cols; - transfer_output_chunk_fp16_to_fp32(dst, vtcm_src, chunk_size, st->n_cols, st->n); - } -} - -static void transfer_output_chunk_threaded(struct htp_context *ctx, float *dst, const __fp16 *vtcm_src, - int n_rows, int n_cols, int n, int n_threads) { - assert(n_cols % HMX_FP16_TILE_N_COLS == 0); - - size_t n_tot_chunks = n_rows; - size_t n_chunks_per_task = (n_threads == 1) ? n_tot_chunks : HMX_FP16_TILE_N_ROWS; // must be multiple of HMX_FP16_TILE_N_ROWS (32) - - output_transfer_task_state_t state; - state.n_tasks = (n_tot_chunks + n_chunks_per_task - 1) / n_chunks_per_task; - state.n_tot_chunks = n_tot_chunks; - state.n_chunks_per_task = n_chunks_per_task; - state.dst = dst; - state.vtcm_src = vtcm_src; - state.n_cols = n_cols; - state.n = n; - - if (state.n_tasks == 1 || n_threads == 1) { - transfer_output_chunk_worker_fn(1, 0, &state); - } else { - worker_pool_run_func(ctx->worker_pool, transfer_output_chunk_worker_fn, &state, n_threads); - } -} - -// activations : fp32 -> fp16 - -static void transfer_activation_chunk_fp32_to_fp16(__fp16 *restrict vtcm_dst, const float *restrict src, int n_rows, int k_block, int k_stride) { - const int n_rows_padded = hex_align_up(n_rows, HMX_FP16_TILE_N_ROWS); - const int n_rows_tiled = (n_rows / HMX_FP16_TILE_N_ROWS) * HMX_FP16_TILE_N_ROWS; - - int r = 0; - - #pragma unroll(2) - for (r = 0; r < n_rows_tiled; r += 2) { - int r0 = r / HMX_FP16_TILE_N_ROWS; // tile row index - int r1 = r % HMX_FP16_TILE_N_ROWS; // intra-tile row idx - - const HVX_Vector *pv_in0 = (const HVX_Vector *) (src + (r + 0) * k_stride); - const HVX_Vector *pv_in1 = (const HVX_Vector *) (src + (r + 1) * k_stride); - for (int c = 0; c < k_block; c += 32) { - HVX_Vector v0 = *pv_in0++; - HVX_Vector v1 = *pv_in1++; - - HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); - - // compute output position - int c0 = c / HMX_FP16_TILE_N_COLS; // tile column index - int tile_idx = r0 * (k_block / HMX_FP16_TILE_N_COLS) + c0; - - HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HMX_FP16_TILE_N_ELMS); - tile[r1 / 2] = v_out; - } - } - - for (; r < n_rows_padded; r += 2) { - int r0 = r / HMX_FP16_TILE_N_ROWS; // tile row index - int r1 = r % HMX_FP16_TILE_N_ROWS; // intra-tile row idx - - const bool row0_valid = r < n_rows; - const bool row1_valid = (r + 1) < n_rows; - - const HVX_Vector *pv_in0 = row0_valid ? (const HVX_Vector *) (src + (r + 0) * k_stride) : NULL; - const HVX_Vector *pv_in1 = row1_valid ? (const HVX_Vector *) (src + (r + 1) * k_stride) : NULL; - for (int c = 0; c < k_block; c += 32) { - HVX_Vector v0 = row0_valid ? *pv_in0++ : Q6_V_vzero(); - HVX_Vector v1 = row1_valid ? *pv_in1++ : Q6_V_vzero(); - - HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); - - // compute output position - int c0 = c / HMX_FP16_TILE_N_COLS; // tile column index - int tile_idx = r0 * (k_block / HMX_FP16_TILE_N_COLS) + c0; - - HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HMX_FP16_TILE_N_ELMS); - tile[r1 / 2] = v_out; - } - } -} - -typedef struct { - __fp16 *dst; - const float *src; - int n_tasks; - int n_tot_chunks; - int n_chunks_per_task; - int k_block; - int k_stride; -} activation_transfer_task_state_t; - -static void transfer_activation_chunk_worker_fn(unsigned int n, unsigned int i, void *data) { - activation_transfer_task_state_t *st = (activation_transfer_task_state_t *) data; - - for (unsigned int task_id = i; task_id < (unsigned int)st->n_tasks; task_id += n) { - // one chunk: one row - int chunk_idx = task_id * st->n_chunks_per_task; - size_t chunk_size = hex_smin(st->n_tot_chunks - chunk_idx, st->n_chunks_per_task); - - __fp16 *dst = st->dst + chunk_idx * st->k_block; - const float *src = st->src + chunk_idx * st->k_stride; - transfer_activation_chunk_fp32_to_fp16(dst, src, chunk_size, st->k_block, st->k_stride); - } -} - -static void transfer_activation_chunk_threaded(struct htp_context *ctx, __fp16 *dst, const float *src, int n_rows, int k_block, int k_stride, int n_threads) { - assert(k_block % HMX_FP16_TILE_N_COLS == 0 && k_stride % HMX_FP16_TILE_N_COLS == 0); - assert(VLEN == 32 * sizeof(float)); - - size_t n_tot_chunks = n_rows; - size_t n_chunks_per_task = (n_threads == 1) ? n_tot_chunks : 32; // must be multiple of 32 to ensure correct destination address - - activation_transfer_task_state_t state; - state.n_tasks = (n_tot_chunks + n_chunks_per_task - 1) / n_chunks_per_task; - state.n_tot_chunks = n_tot_chunks; - state.n_chunks_per_task = n_chunks_per_task; - state.dst = dst; - state.src = src; - state.k_block = k_block; - state.k_stride = k_stride; - - if (state.n_tasks == 1 || n_threads == 1) { - transfer_activation_chunk_worker_fn(1, 0, &state); - } else { - worker_pool_run_func(ctx->worker_pool, transfer_activation_chunk_worker_fn, &state, n_threads); - } -} - -// C += AB -static void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, const __fp16 *restrict b, - const __fp16 *restrict col_scales, const __fp16 *restrict eye_tile, - int n_row_tiles, int n_col_tiles, int n_dot_tiles, bool zero_init) { - __builtin_assume(n_row_tiles > 0); - __builtin_assume(n_col_tiles > 0); - __builtin_assume(n_dot_tiles > 0); - - Q6_bias_mxmem2_A((void *)col_scales); - - const size_t dot_tile_stride = n_dot_tiles * HMX_FP16_TILE_N_ELMS; - for (size_t i = 0; i < n_row_tiles; ++i) { - const __fp16 *row_base = a + i * dot_tile_stride; - __fp16 *res_base = c + i * n_col_tiles * HMX_FP16_TILE_N_ELMS; - for (size_t j = 0; j < n_col_tiles; ++j) { - Q6_mxclracc_hf(); - - const __fp16 *col_tiles = b + j * dot_tile_stride; - const __fp16 *row_tiles = row_base; - __fp16 *accum_tile = res_base + j * HMX_FP16_TILE_N_ELMS; - if (!zero_init) { - Q6_activation_hf_mxmem_RR((unsigned int)accum_tile, 2047); - Q6_weight_hf_mxmem_RR((unsigned int)eye_tile, 2047); - } - - for (int k = 0, k_block; k < n_dot_tiles; k += k_block) { - k_block = hex_smin(n_dot_tiles - k, 32); - const uint32_t range = 2048u * (uint32_t)k_block - 1; - Q6_activation_hf_mxmem_RR_deep((unsigned int)row_tiles, range); - Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, range); - row_tiles += k_block * HMX_FP16_TILE_N_ELMS; - col_tiles += k_block * HMX_FP16_TILE_N_ELMS; - } - - Q6_mxmem_AR_after_hf(accum_tile, 0); - } - } -} - -int hmx_matmul_2d_f32(struct htp_context *ctx, float *restrict dst, const float *restrict activation, - const uint8_t *restrict permuted_weight, int m, int k, int n, - int act_stride, int weight_stride, int weight_type) { - if (k % 32 != 0 || n % 32 != 0) { return -1; } - - if (!hex_is_aligned(dst, VLEN) || !hex_is_aligned(activation, VLEN) || !hex_is_aligned(permuted_weight, VLEN)) { - return -1; - } - - size_t row_stride = get_x4x2_row_stride(weight_type, k); - if (row_stride == 0) { - return -1; - } - - worker_callback_t dequant_worker_fn = NULL; - switch (weight_type) { - case HTP_TYPE_Q4_0: dequant_worker_fn = dequantize_x4x2_worker_loop_q4_0; break; - case HTP_TYPE_IQ4_NL: dequant_worker_fn = dequantize_x4x2_worker_loop_iq4_nl; break; - case HTP_TYPE_Q4_1: dequant_worker_fn = dequantize_x4x2_worker_loop_q4_1; break; - case HTP_TYPE_MXFP4: dequant_worker_fn = dequantize_x4x2_worker_loop_mxfp4; break; - case HTP_TYPE_Q8_0: dequant_worker_fn = dequantize_x4x2_worker_loop_q8_0; break; - case HTP_TYPE_F16: dequant_worker_fn = convert_f16_worker_loop; break; - case HTP_TYPE_F32: dequant_worker_fn = quantize_f32_worker_loop; break; - default: - return -1; - } - - const int n_k_tiles = k / HMX_FP16_TILE_N_COLS; - const struct fastdiv_values n_k_tiles_div = init_fastdiv_values(n_k_tiles); - - // --- Dynamic Mode Configuration --- - const bool use_pipeline = (m > 32); - const int num_threads = (m <= 32) ? 1 : ctx->n_threads; - - // --- Dynamic VTCM layout --- - const size_t vec_dot_size = k * sizeof(__fp16); - const size_t vtcm_budget = ctx->vtcm_size; - size_t vtcm_used = 0; - - // Pipeline = 4-stage DMA→dequant→HMX→store with HMX worker overlap. - const size_t size_per_n = row_stride + (use_pipeline ? 2 * vec_dot_size : vec_dot_size); // Q + S0 + S1 (dequant bufs) - const size_t size_per_mn = (use_pipeline ? 2 : 1) * sizeof(__fp16); // O x 2 (output double buffer) - - size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0; - if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, size_per_n, /*per_m=*/vec_dot_size, size_per_mn, - hex_align_up(m, HMX_FP16_TILE_N_ROWS), n, - /*m_block_cost=*/(size_t) n * 3, - /*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used)) { - FARF(HIGH, "hmx-mm-2d: VTCM too small : m %d k %d n %d budget %zu", m, k, n, vtcm_budget); - return -1; - } - - const size_t weight_area_size = hex_align_up(n_chunk_n_cols * row_stride, HMX_FP16_TILE_SIZE); - const size_t act_area_size = hex_align_up(m_chunk_n_rows * vec_dot_size, HMX_FP16_TILE_SIZE); - const size_t output_area_size = hex_align_up(m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HMX_FP16_TILE_SIZE); - - size_t scratch0_size, scratch1_size, scratch2_size; - scratch0_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE); // dequant buf 0 - scratch1_size = use_pipeline ? scratch0_size : 0; // dequant buf 1 - scratch2_size = use_pipeline ? output_area_size : 0; // output buf 1 - - uint8_t *vtcm_ptr = (uint8_t *) ctx->vtcm_base; - __fp16 *vtcm_weight = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size); - __fp16 *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, act_area_size); - __fp16 *vtcm_output = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, output_area_size); - void *vtcm_scratch0 = vtcm_seq_alloc(&vtcm_ptr, scratch0_size); - void *vtcm_scratch1 = scratch1_size ? vtcm_seq_alloc(&vtcm_ptr, scratch1_size) : NULL; - void *vtcm_scratch2 = scratch2_size ? vtcm_seq_alloc(&vtcm_ptr, scratch2_size) : NULL; - __fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256); - - vtcm_used = vtcm_ptr - (uint8_t *) ctx->vtcm_base; - if (vtcm_used > vtcm_budget) { - FARF(ERROR, "hmx-mm-2d: VTCM overflow: used %zu budget %zu", vtcm_used, vtcm_budget); - return -1; - } - - hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16 - - FARF(HIGH, "hmx-mm-2d: standard : m %d k %d n %d wtype %d mc %zu nc %zu vtcm %zu/%zu", - m, k, n, weight_type, m_chunk_n_rows, n_chunk_n_cols, vtcm_used, vtcm_budget); - - TIMER_DEFINE(activation_load); - TIMER_DEFINE(weight_load); - TIMER_DEFINE(hmx_core); - TIMER_DEFINE(output_store); - - TIMER_DEFINE(total); - TIMER_START(total); - - int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols); - - if (use_pipeline) { - // --- Asynchronous Pipelined Loop (Current implementation) --- - hmx_matmul_job_t job_slots[2]; // persistent double-buffered job descriptors - - for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) { - const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows); - - void *vtcm_qweight = vtcm_weight; - void *vtcm_weight_bufs[2] = { vtcm_scratch0, vtcm_scratch1 }; - void *vtcm_output_bufs[2] = { vtcm_output, vtcm_scratch2 }; - - // prologue: A0 - const size_t n_cols_A0 = hex_smin(n - 0 * n_chunk_n_cols, n_chunk_n_cols); - { - const uint8_t *qweight_chunk_A0 = permuted_weight; - dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A0), row_stride, weight_stride, row_stride, n_cols_A0); - } - - { - const float *activation_chunk = activation + mr * act_stride; - transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, act_stride, num_threads); - } - - // prologue: B0, A1, submit C0 (async), B1 (overlaps C0) - { - // B0: wait for DMA, dequant weight chunk 0 - dma_queue_pop(ctx->dma[0]); - dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn, num_threads); - - // A1: issue DMA for weight chunk 1 - const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols); - if (1 < n_chunk_cnt) { - const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * weight_stride; - dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, weight_stride, row_stride, n_cols_A1); - } - - // submit C0 (non-blocking — HMX worker executes in parallel) - hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, - (__fp16 *) vtcm_weight_bufs[0], vtcm_scales, - hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), - hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS); - hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[0])); - - // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker) - if (1 < n_chunk_cnt) { - dma_queue_pop(ctx->dma[0]); - dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn, num_threads); - } - } - - // main loop: wait C_i → submit C_{i+1} → D_i + B_{i+2} (parallel with C_{i+1}) - for (int i = 0; i < n_chunk_cnt; ++i) { - const size_t nc = i * n_chunk_n_cols; - const size_t nc_p1 = nc + 1 * n_chunk_n_cols; - const size_t nc_p2 = nc + 2 * n_chunk_n_cols; - - const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols); - const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols); - const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols); - - // issue A_{i+2}: DMA push (non-blocking) - if (i + 2 < n_chunk_cnt) { - const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * weight_stride; - dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, weight_stride, row_stride, n_cols_p2); - } - - // wait C_i: block until prologue/previous C completes - hmx_queue_pop(ctx->hmx_queue); - - // submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below) - if (i + 1 < n_chunk_cnt) { - hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2], - (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], - vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), - hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS); - hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[(i + 1) % 2])); - } - - // D_i: store output (multi-thread HVX, parallel with C_{i+1}) - float *output_chunk = dst + (mr * n + nc); - transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n, num_threads); - - // B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1}) - if (i + 2 < n_chunk_cnt) { - dma_queue_pop(ctx->dma[0]); - dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn, num_threads); - } - } - } - hmx_queue_suspend(ctx->hmx_queue); - } else { - // --- Synchronous Loop (Optimized for small/non-pipelined cases) --- - HAP_compute_res_hmx_lock(ctx->vtcm_rctx); - - for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) { - const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows); - const size_t n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS); - - // Load Activation - const float *activation_chunk = activation + mr * act_stride; - transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, act_stride, num_threads); - - for (size_t nc = 0; nc < n; nc += n_chunk_n_cols) { - const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols); - const size_t n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS); - - // A: DMA Load Weight - const uint8_t *qweight_chunk = permuted_weight + nc * weight_stride; - dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_weight, qweight_chunk), row_stride, weight_stride, row_stride, n_cols); - dma_queue_pop(ctx->dma[0]); - - // B: Dequantize / Convert Weight - dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_scratch0, vtcm_weight, n_cols, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn, num_threads); - - // C: HMX Compute (Synchronous) - core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HMX_FP16_TILE_N_ROWS); - - // D: Output Store - float *output_chunk = dst + (mr * n + nc); - transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output, n_rows, n_cols, n, num_threads); - } - } - HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); - } - - TIMER_STOP(total); - -#if defined(ENABLE_PROFILE_TIMERS) - FARF(HIGH, "hex-mm-2d: %lld us : m %d k %d n %d", TIMER_US(total), m, k, n); - if (!use_pipeline) { - FARF(HIGH, " activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us", - TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store)); - size_t weight_size = (size_t)n * row_stride; - float bandwidth = 1e-3f * weight_size / (float)TIMER_US(weight_load); - FARF(HIGH, " weight load bandwidth: %.2f GB/s", bandwidth); - } -#endif - - return 0; -} - -// - -static inline int hmx_matmul_batch_r2(const hmx_matmul_f16_f32_batched_params_t *params) { - return params->ne02 > 0 ? params->ne12 / params->ne02 : 1; -} - -static inline int hmx_matmul_batch_r3(const hmx_matmul_f16_f32_batched_params_t *params) { - return params->ne03 > 0 ? params->ne13 / params->ne03 : 1; -} - -static inline const __fp16 *hmx_matmul_weight_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params, - int dst_b2, int dst_b3) { - const int r2 = hmx_matmul_batch_r2(params); - const int r3 = hmx_matmul_batch_r3(params); - return (const __fp16 *) ((const uint8_t *) params->permuted_weight + - (size_t) (dst_b2 / r2) * params->src0_nb2 + - (size_t) (dst_b3 / r3) * params->src0_nb3); -} - -static inline const float *hmx_matmul_activation_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params, - int dst_b2, int dst_b3) { - return (const float *) ((const uint8_t *) params->activation + - (size_t) dst_b2 * params->src1_nb2 + - (size_t) dst_b3 * params->src1_nb3); -} - -static inline float *hmx_matmul_dst_batch_ptr(const hmx_matmul_f16_f32_batched_params_t *params, - int dst_b2, int dst_b3) { - return (float *) ((uint8_t *) params->dst + - (size_t) dst_b2 * params->dst_nb2 + - (size_t) dst_b3 * params->dst_nb3); -} - -static int hmx_matmul_f16_f32_batched_legacy(struct htp_context *ctx, - const hmx_matmul_f16_f32_batched_params_t *params) { - int ret = 0; - for (int b3 = 0; b3 < params->ne13 && ret == 0; ++b3) { - for (int b2 = 0; b2 < params->ne12 && ret == 0; ++b2) { - ret = hmx_matmul_f16_f32(ctx, hmx_matmul_dst_batch_ptr(params, b2, b3), - hmx_matmul_activation_batch_ptr(params, b2, b3), - hmx_matmul_weight_batch_ptr(params, b2, b3), - params->m, params->k, params->n, - params->act_stride, params->weight_stride); - } - } - return ret; -} - -int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params) { - if (!ctx || !params || !params->dst || !params->activation || !params->permuted_weight) { return -1; } - if (!params->m || !params->k || !params->n) { return -1; } - if (params->act_stride < params->k || params->weight_stride < params->k || params->dst_stride < params->n) { return -1; } - if (params->ne02 <= 0 || params->ne03 <= 0 || params->ne12 <= 0 || params->ne13 <= 0) { return -1; } - if (params->ne12 % params->ne02 != 0 || params->ne13 % params->ne03 != 0) { return -1; } - if (params->k % 32 != 0 || params->n % 32 != 0) { return -1; } - - if (!hex_is_aligned(params->dst, VLEN) || - !hex_is_aligned(params->activation, VLEN) || - !hex_is_aligned(params->permuted_weight, VLEN)) { - return -1; - } - - const int group_size = hmx_matmul_batch_r2(params); - - if (group_size <= 1) { - FARF(HIGH, "%s: no dim2 GQA reuse (group=%d), using legacy batched loop", __func__, group_size); - return hmx_matmul_f16_f32_batched_legacy(ctx, params); - } - - // Grouped path: reuse interleaved weight across all q_heads sharing a - // kv_head. Each q_head gets its own activation buffer in VTCM (so - // activation is loaded once per m_chunk and reused across all n_chunks), - // and each q_head is computed individually to avoid tile-major packing - // issues. m_chunk_n_rows is always a multiple of 32 (from - // hmx_compute_chunks), so per-head tile arrays don't overlap. - const size_t vtcm_budget = ctx->vtcm_size; - const size_t vec_dot_size = params->k * sizeof(__fp16); - - // When the activation has a large stride (e.g. permuted Q tensor with - // act_stride >> k), HVX vector loads from strided DDR thrash L2 cache. - // Allocate an F32 scratch buffer in VTCM and use 2D DMA to gather - // strided rows into a contiguous block before the F32->F16 conversion. - const bool use_dma_activation = (params->act_stride > params->k); - const size_t f32_scratch_per_m = use_dma_activation ? (size_t) params->k * sizeof(float) : 0; - - size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0; - // FP16 weight: interleave and activation load have similar per-element cost. - if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, - /*per_n=*/3 * vec_dot_size, - /*per_m=*/group_size * vec_dot_size + f32_scratch_per_m, - /*per_mn=*/sizeof(__fp16), - hex_align_up(params->m, HMX_FP16_TILE_N_ROWS), params->n, - /*m_block_cost=*/(size_t) params->n, - /*n_block_cost=*/(size_t) params->m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) { - FARF(HIGH, "%s: grouped path does not fit VTCM, falling back to legacy batched loop", __func__); - return hmx_matmul_f16_f32_batched_legacy(ctx, params); - } - - const size_t act_head_stride = m_chunk_n_rows * (size_t) params->k; // fp16 elements between heads - const size_t weight_area_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE); - const size_t activation_area_size = hex_align_up(group_size * m_chunk_n_rows * vec_dot_size, HMX_FP16_TILE_SIZE); - const size_t output_area_size = hex_align_up(m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HMX_FP16_TILE_SIZE); - const size_t scratch_area_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE); - const size_t f32_scratch_size = use_dma_activation - ? hex_align_up(m_chunk_n_rows * (size_t) params->k * sizeof(float), HMX_FP16_TILE_SIZE) : 0; - - uint8_t *vtcm_ptr = (uint8_t *) ctx->vtcm_base; - __fp16 *vtcm_weight = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size); - __fp16 *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, activation_area_size); - __fp16 *vtcm_output = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, output_area_size); - void *vtcm_scratch0 = vtcm_seq_alloc(&vtcm_ptr, scratch_area_size); - void *vtcm_scratch1 = vtcm_seq_alloc(&vtcm_ptr, scratch_area_size); - __fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256); - float *vtcm_f32_act = use_dma_activation ? (float *) vtcm_seq_alloc(&vtcm_ptr, f32_scratch_size) : NULL; - - if ((size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base) > vtcm_budget) { - FARF(HIGH, "%s: grouped layout overflowed VTCM, falling back to legacy batched loop", __func__); - return hmx_matmul_f16_f32_batched_legacy(ctx, params); - } - - hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16 - - FARF(HIGH, "%s: grouped path m=%d k=%d n=%d group=%d streams=%d mc=%zu nc=%zu vtcm=%zu/%zu", - __func__, params->m, params->k, params->n, group_size, params->ne13, - m_chunk_n_rows, n_chunk_n_cols, - (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget); - - TIMER_DEFINE(activation_load); - TIMER_DEFINE(weight_load); - TIMER_DEFINE(hmx_core); - TIMER_DEFINE(output_store); - TIMER_DEFINE(total); - - TIMER_START(total); - - const size_t fp16_row_bytes = (size_t) params->k * sizeof(__fp16); - const size_t weight_row_bytes = (size_t) params->weight_stride * sizeof(__fp16); - - HAP_compute_res_hmx_lock(ctx->vtcm_rctx); - - for (int b3 = 0; b3 < params->ne13; ++b3) { - for (int b2_base = 0; b2_base < params->ne12; b2_base += group_size) { - const __fp16 *weight_group = hmx_matmul_weight_batch_ptr(params, b2_base, b3); - - for (size_t mr = 0; mr < (size_t) params->m; mr += m_chunk_n_rows) { - const size_t n_rows = hex_smin((size_t) params->m - mr, m_chunk_n_rows); - const size_t n_row_tiles = hmx_ceil_div((int) n_rows, HMX_FP16_TILE_N_ROWS); - - // Pre-load activations for all heads in the group (once per m_chunk). - // When the source is strided (permuted Q), use 2D DMA to gather - // contiguous rows into a VTCM scratch buffer first, then HVX - // converts from the contiguous VTCM buffer. This avoids L2 cache - // thrashing from HVX loads at large strides. - TIMER_START(activation_load); - for (int g = 0; g < group_size; ++g) { - const float *activation_chunk = hmx_matmul_activation_batch_ptr(params, b2_base + g, b3) + mr * params->act_stride; - __fp16 *vtcm_act_g = vtcm_activation + (size_t) g * act_head_stride; - if (use_dma_activation) { - const size_t row_bytes = (size_t) params->k * sizeof(float); - const size_t stride_bytes = (size_t) params->act_stride * sizeof(float); - dma_queue_push(ctx->dma[0], - dma_make_ptr(vtcm_f32_act, activation_chunk), - row_bytes, stride_bytes, row_bytes, n_rows); - dma_queue_pop(ctx->dma[0]); - transfer_activation_chunk_threaded(ctx, vtcm_act_g, - vtcm_f32_act, (int) n_rows, - params->k, params->k, ctx->n_threads); - } else { - transfer_activation_chunk_threaded(ctx, vtcm_act_g, - activation_chunk, (int) n_rows, - params->k, params->act_stride, ctx->n_threads); - } - } - TIMER_STOP(activation_load); - - void *buf_curr = vtcm_scratch0; - void *buf_next = vtcm_scratch1; - - { - const size_t n_cols_first = hex_smin((size_t) params->n, n_chunk_n_cols); - dma_queue_push(ctx->dma[0], dma_make_ptr(buf_curr, weight_group), - fp16_row_bytes, weight_row_bytes, fp16_row_bytes, n_cols_first); - } - - for (size_t nc = 0; nc < (size_t) params->n; nc += n_chunk_n_cols) { - const size_t n_cols = hex_smin((size_t) params->n - nc, n_chunk_n_cols); - const size_t n_col_tiles = hmx_ceil_div((int) n_cols, HMX_FP16_TILE_N_COLS); - - TIMER_START(weight_load); - { - dma_queue_pop(ctx->dma[0]); - - const size_t nc_next = nc + n_chunk_n_cols; - if (nc_next < (size_t) params->n) { - const size_t n_cols_next = hex_smin((size_t) params->n - nc_next, n_chunk_n_cols); - const __fp16 *next_weight_chunk = weight_group + nc_next * params->weight_stride; - - dma_queue_push(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk), - fp16_row_bytes, weight_row_bytes, fp16_row_bytes, n_cols_next); - } - - hmx_interleave_rows_to_tiles(vtcm_weight, (const __fp16 *) buf_curr, n_cols, params->k, params->k, - 0, n_cols); - hex_swap_ptr(&buf_curr, &buf_next); - } - TIMER_STOP(weight_load); - - // Reuse the interleaved weight for every q_head in this GQA group - for (int g = 0; g < group_size; ++g) { - TIMER_START(hmx_core); - { - const __fp16 * vtcm_act_g = vtcm_activation + (size_t) g * act_head_stride; - core_dot_chunk_fp16(vtcm_output, vtcm_act_g, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles, - params->k / 32); - } - TIMER_STOP(hmx_core); - - TIMER_START(output_store); - { - float *output = hmx_matmul_dst_batch_ptr(params, b2_base + g, b3) + mr * params->dst_stride + nc; - transfer_output_chunk_threaded(ctx, output, vtcm_output, (int) n_rows, (int) n_cols, params->dst_stride, ctx->n_threads); - } - TIMER_STOP(output_store); - } - } - } - } - } - - HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); - - TIMER_STOP(total); - -#if defined(ENABLE_PROFILE_TIMERS) - FARF(HIGH, "%s: %lld us, m=%d k=%d n=%d group=%d", __func__, TIMER_US(total), - params->m, params->k, params->n, group_size); - FARF(HIGH, " activation_load: %lld us, weight_load: %lld us, hmx_core: %lld us, output_store: %lld us", - TIMER_US(activation_load), TIMER_US(weight_load), TIMER_US(hmx_core), TIMER_US(output_store)); -#endif - - return 0; -} - -int hmx_matmul_f16_f32(struct htp_context *ctx, float *restrict dst, const float *restrict activation, - const __fp16 *restrict permuted_weight, int m, int k, int n, - int act_stride, int weight_stride) { - if (!dst || !activation || !permuted_weight || !m || !n || !k) { return -1; } - return hmx_matmul_2d_f32(ctx, dst, activation, (const uint8_t *)permuted_weight, m, k, n, - act_stride, weight_stride * (int)sizeof(__fp16), HTP_TYPE_F16); -} - -struct mmid_row_mapping { - uint32_t i1; - uint32_t i2; -}; - -typedef struct { - __fp16 *dst; - const float *src; - int n_tasks; - int n_tot_chunks; - int n_chunks_per_task; - int k_block; - const struct mmid_row_mapping *matrix_rows; - int cur_a; - int mapping_stride; - int ne11; - struct fastdiv_values ne11_div; - size_t nb11; - size_t nb12; - int start_row; - int cne1; -} activation_transfer_gathered_task_state_t; - -typedef struct { - const __fp16 *vtcm_src; - float *dst; - int n_tasks; - int n_tot_chunks; - int n_chunks_per_task; - int n_cols; - const struct mmid_row_mapping *matrix_rows; - int cur_a; - int mapping_stride; - size_t dst_nb1; - size_t dst_nb2; - int start_row; - int cne1; -} output_transfer_scattered_task_state_t; - -static void transfer_activation_chunk_fp32_to_fp16_gathered( - __fp16 *restrict vtcm_dst, - const float *restrict src, - int start_row, - int n_rows, - int k_block, - const struct mmid_row_mapping *matrix_rows, - int cur_a, - int mapping_stride, - int ne11, - const struct fastdiv_values * ne11_div, - size_t nb11, - size_t nb12, - int cne1) { - const int n_rows_padded = hex_align_up(n_rows, HMX_FP16_TILE_N_ROWS); - const int n_rows_tiled = (n_rows / HMX_FP16_TILE_N_ROWS) * HMX_FP16_TILE_N_ROWS; - - int r = 0; - - #pragma unroll(2) - for (r = 0; r < n_rows_tiled; r += 2) { - int r0 = r / HMX_FP16_TILE_N_ROWS; // tile row index - int r1 = r % HMX_FP16_TILE_N_ROWS; // intra-tile row idx - - int r_idx0 = start_row + r + 0; - int r_idx1 = start_row + r + 1; - - struct mmid_row_mapping mapping0 = matrix_rows[cur_a * mapping_stride + r_idx0]; - struct mmid_row_mapping mapping1 = matrix_rows[cur_a * mapping_stride + r_idx1]; - - int i11_0 = fastmodulo(mapping0.i1, ne11, ne11_div); - int i11_1 = fastmodulo(mapping1.i1, ne11, ne11_div); - - const float *row0_ptr = (const float *) ((const uint8_t *) src + i11_0 * nb11 + mapping0.i2 * nb12); - const float *row1_ptr = (const float *) ((const uint8_t *) src + i11_1 * nb11 + mapping1.i2 * nb12); - - const HVX_Vector *pv_in0 = (const HVX_Vector *) row0_ptr; - const HVX_Vector *pv_in1 = (const HVX_Vector *) row1_ptr; - - for (int c = 0; c < k_block; c += 32) { - HVX_Vector v0 = *pv_in0++; - HVX_Vector v1 = *pv_in1++; - - HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); - - int c0 = c / HMX_FP16_TILE_N_COLS; // tile column index - int tile_idx = r0 * (k_block / HMX_FP16_TILE_N_COLS) + c0; - - HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HMX_FP16_TILE_N_ELMS); - tile[r1 / 2] = v_out; - } - } - - for (; r < n_rows_padded; r += 2) { - int r0 = r / HMX_FP16_TILE_N_ROWS; // tile row index - int r1 = r % HMX_FP16_TILE_N_ROWS; // intra-tile row idx - - const bool row0_valid = (start_row + r + 0) < cne1; - const bool row1_valid = (start_row + r + 1) < cne1; - - const float *row0_ptr = NULL; - const float *row1_ptr = NULL; - - if (row0_valid) { - struct mmid_row_mapping mapping0 = matrix_rows[cur_a * mapping_stride + (start_row + r + 0)]; - int i11_0 = fastmodulo(mapping0.i1, ne11, ne11_div); - row0_ptr = (const float *) ((const uint8_t *) src + i11_0 * nb11 + mapping0.i2 * nb12); - } - if (row1_valid) { - struct mmid_row_mapping mapping1 = matrix_rows[cur_a * mapping_stride + (start_row + r + 1)]; - int i11_1 = fastmodulo(mapping1.i1, ne11, ne11_div); - row1_ptr = (const float *) ((const uint8_t *) src + i11_1 * nb11 + mapping1.i2 * nb12); - } - - const HVX_Vector *pv_in0 = (const HVX_Vector *) row0_ptr; - const HVX_Vector *pv_in1 = (const HVX_Vector *) row1_ptr; - - for (int c = 0; c < k_block; c += 32) { - HVX_Vector v0 = row0_valid ? *pv_in0++ : Q6_V_vzero(); - HVX_Vector v1 = row1_valid ? *pv_in1++ : Q6_V_vzero(); - - HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); - - int c0 = c / HMX_FP16_TILE_N_COLS; // tile column index - int tile_idx = r0 * (k_block / HMX_FP16_TILE_N_COLS) + c0; - - HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HMX_FP16_TILE_N_ELMS); - tile[r1 / 2] = v_out; - } - } -} - -static void transfer_activation_chunk_gathered_worker_fn(unsigned int n, unsigned int i, void *data) { - activation_transfer_gathered_task_state_t *st = data; - int chunk_idx = i; - int chunk_size = st->n_chunks_per_task; - int start_row = st->start_row + chunk_idx * chunk_size; - int n_rows = hex_smin(st->cne1 - start_row, chunk_size); - if (n_rows > 0) { - __fp16 *dst = st->dst + (size_t)(start_row - st->start_row) * st->k_block; - transfer_activation_chunk_fp32_to_fp16_gathered( - dst, st->src, start_row, n_rows, st->k_block, - st->matrix_rows, st->cur_a, st->mapping_stride, - st->ne11, &st->ne11_div, st->nb11, st->nb12, st->cne1); - } -} - -static void transfer_activation_chunk_gathered_threaded( - struct htp_context *ctx, - __fp16 *dst, - const float *src, - int start_row, - int n_rows, - int k_block, - const struct mmid_row_mapping *matrix_rows, - int cur_a, - int mapping_stride, - int ne11, - size_t nb11, - size_t nb12, - int cne1, - int n_threads) { - if (n_rows <= 0) return; - int chunks_per_thread = hmx_ceil_div(n_rows, n_threads); - chunks_per_thread = hex_align_up(chunks_per_thread, HMX_FP16_TILE_N_ROWS); - - int actual_threads = hmx_ceil_div(n_rows, chunks_per_thread); - - activation_transfer_gathered_task_state_t state = { - .dst = dst, - .src = src, - .n_tasks = actual_threads, - .n_tot_chunks = n_rows, - .n_chunks_per_task = chunks_per_thread, - .k_block = k_block, - .matrix_rows = matrix_rows, - .cur_a = cur_a, - .mapping_stride = mapping_stride, - .ne11 = ne11, - .ne11_div = init_fastdiv_values(ne11), - .nb11 = nb11, - .nb12 = nb12, - .start_row = start_row, - .cne1 = cne1, - }; - - if (actual_threads <= 1) { - transfer_activation_chunk_gathered_worker_fn(1, 0, &state); - } else { - worker_pool_run_func(ctx->worker_pool, transfer_activation_chunk_gathered_worker_fn, &state, actual_threads); - } -} - -static void transfer_output_chunk_fp16_to_fp32_scattered( - float *restrict dst, - const __fp16 *restrict vtcm_src, - int start_row, - int n_rows, - int n_cols, - const struct mmid_row_mapping *matrix_rows, - int cur_a, - int mapping_stride, - size_t dst_nb1, - size_t dst_nb2, - int cne1) { - assert(n_cols % HMX_FP16_TILE_N_COLS == 0); - const size_t tile_row_stride = (n_cols / HMX_FP16_TILE_N_COLS) * HMX_FP16_TILE_N_ELMS; - - const HVX_Vector one = hvx_vec_splat_f16(1.0); - - for (size_t r = 0; r < n_rows; r += 2) { - const size_t r0 = r / HMX_FP16_TILE_N_ROWS; - const size_t r1 = (r % HMX_FP16_TILE_N_ROWS) / 2; // index of the row pair within the tile - const __fp16 *row_base = vtcm_src + r0 * tile_row_stride; - - int r_idx0 = start_row + (int)r + 0; - int r_idx1 = start_row + (int)r + 1; - - if (r_idx0 >= cne1) break; - - struct mmid_row_mapping mapping0 = matrix_rows[cur_a * mapping_stride + r_idx0]; - float *output_row0 = (float *) ((uint8_t *) dst + mapping0.i1 * dst_nb1 + mapping0.i2 * dst_nb2); - - float *output_row1 = NULL; - if (r_idx1 < cne1) { - struct mmid_row_mapping mapping1 = matrix_rows[cur_a * mapping_stride + r_idx1]; - output_row1 = (float *) ((uint8_t *) dst + mapping1.i1 * dst_nb1 + mapping1.i2 * dst_nb2); - } - - #pragma unroll(4) - for (size_t c = 0; c < (size_t)n_cols; c += HMX_FP16_TILE_N_COLS) { - const size_t c0 = c / HMX_FP16_TILE_N_COLS; - const __fp16 *tile = row_base + c0 * HMX_FP16_TILE_N_ELMS; - HVX_Vector v = ((const HVX_Vector *) tile)[r1]; - HVX_VectorPair vp = Q6_Wqf32_vmpy_VhfVhf(v, one); - - volatile HVX_Vector *pv_out0 = (volatile HVX_Vector *) (output_row0 + c); - volatile HVX_Vector *pv_out1 = output_row1 ? (volatile HVX_Vector *) (output_row1 + c) : NULL; - - *pv_out0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(vp)); - if (pv_out1) { - *pv_out1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(vp)); - } - } - } -} - -static void transfer_output_chunk_scattered_worker_fn(unsigned int n, unsigned int i, void *data) { - output_transfer_scattered_task_state_t *st = data; - int chunk_idx = i; - int chunk_size = st->n_chunks_per_task; - int start_row = st->start_row + chunk_idx * chunk_size; - int n_rows = hex_smin(st->cne1 - start_row, chunk_size); - if (n_rows > 0) { - const __fp16 *src = st->vtcm_src + (size_t)(start_row - st->start_row) * st->n_cols; - transfer_output_chunk_fp16_to_fp32_scattered( - st->dst, src, start_row, n_rows, st->n_cols, - st->matrix_rows, st->cur_a, st->mapping_stride, - st->dst_nb1, st->dst_nb2, st->cne1); - } -} - -static void transfer_output_chunk_scattered_threaded( - struct htp_context *ctx, - float *dst, - const __fp16 *vtcm_src, - int start_row, - int n_rows, - int n_cols, - const struct mmid_row_mapping *matrix_rows, - int cur_a, - int mapping_stride, - size_t dst_nb1, - size_t dst_nb2, - int cne1, - int n_threads) { - if (n_rows <= 0) return; - int chunks_per_thread = hmx_ceil_div(n_rows, n_threads); - chunks_per_thread = hex_align_up(chunks_per_thread, HMX_FP16_TILE_N_ROWS); - - int actual_threads = hmx_ceil_div(n_rows, chunks_per_thread); - - output_transfer_scattered_task_state_t state = { - .vtcm_src = vtcm_src, - .dst = dst, - .n_tasks = actual_threads, - .n_tot_chunks = n_rows, - .n_chunks_per_task = chunks_per_thread, - .n_cols = n_cols, - .matrix_rows = matrix_rows, - .cur_a = cur_a, - .mapping_stride = mapping_stride, - .dst_nb1 = dst_nb1, - .dst_nb2 = dst_nb2, - .start_row = start_row, - .cne1 = cne1, - }; - - if (actual_threads <= 1) { - transfer_output_chunk_scattered_worker_fn(1, 0, &state); - } else { - worker_pool_run_func(ctx->worker_pool, transfer_output_chunk_scattered_worker_fn, &state, actual_threads); - } -} - -int hmx_matmul_id_2d_f32(struct htp_context *ctx, - float *restrict dst, - const float *activation, - const uint8_t *permuted_weight, - int m, int k, int n, - int ne11, - size_t act_nb1, size_t act_nb2, - size_t dst_nb1, size_t dst_nb2, - int weight_stride, - int weight_type, - const struct mmid_row_mapping *matrix_rows, - int cur_a, - int mapping_stride) { - const int cne1 = m; - const int m_padded = hex_align_up(m, 32); - - if (k % 32 != 0 || n % 32 != 0) { return -1; } - - if (!hex_is_aligned(dst, VLEN) || !hex_is_aligned(activation, VLEN) || !hex_is_aligned(permuted_weight, VLEN)) { - return -1; - } - - size_t row_stride = get_x4x2_row_stride(weight_type, k); - if (row_stride == 0) { - return -1; - } - - worker_callback_t dequant_worker_fn = NULL; - switch (weight_type) { - case HTP_TYPE_Q4_0: dequant_worker_fn = dequantize_x4x2_worker_loop_q4_0; break; - case HTP_TYPE_IQ4_NL: dequant_worker_fn = dequantize_x4x2_worker_loop_iq4_nl; break; - case HTP_TYPE_Q4_1: dequant_worker_fn = dequantize_x4x2_worker_loop_q4_1; break; - case HTP_TYPE_MXFP4: dequant_worker_fn = dequantize_x4x2_worker_loop_mxfp4; break; - case HTP_TYPE_Q8_0: dequant_worker_fn = dequantize_x4x2_worker_loop_q8_0; break; - case HTP_TYPE_F16: dequant_worker_fn = convert_f16_worker_loop; break; - case HTP_TYPE_F32: dequant_worker_fn = quantize_f32_worker_loop; break; - default: - return -1; - } - - const int n_k_tiles = k / HMX_FP16_TILE_N_COLS; - const struct fastdiv_values n_k_tiles_div = init_fastdiv_values(n_k_tiles); - - const int num_threads = ctx->n_threads; - - const size_t vec_dot_size = k * sizeof(__fp16); - const size_t vtcm_budget = ctx->vtcm_size; - size_t vtcm_used = 0; - - const size_t size_per_n = row_stride + vec_dot_size; - const size_t size_per_mn = sizeof(__fp16); - - size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0; - if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, size_per_n, /*per_m=*/vec_dot_size, size_per_mn, - m_padded, n, - /*m_block_cost=*/(size_t) n * 3, - /*n_block_cost=*/(size_t) m_padded * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used)) { - FARF(HIGH, "hmx-mm-id-2d: VTCM too small : m %d k %d n %d budget %zu", m_padded, k, n, vtcm_budget); - return -1; - } - - const size_t weight_area_size = hex_align_up(n_chunk_n_cols * row_stride, HMX_FP16_TILE_SIZE); - const size_t act_area_size = hex_align_up(m_chunk_n_rows * vec_dot_size, HMX_FP16_TILE_SIZE); - const size_t output_area_size = hex_align_up(m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HMX_FP16_TILE_SIZE); - - size_t scratch0_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HMX_FP16_TILE_SIZE); - - uint8_t *vtcm_ptr = (uint8_t *) ctx->vtcm_base; - __fp16 *vtcm_weight = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size); - __fp16 *vtcm_activation = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, act_area_size); - __fp16 *vtcm_output = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, output_area_size); - void *vtcm_scratch0 = vtcm_seq_alloc(&vtcm_ptr, scratch0_size); - __fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256); - - vtcm_used = vtcm_ptr - (uint8_t *) ctx->vtcm_base; - if (vtcm_used > vtcm_budget) { - FARF(ERROR, "hmx-mm-id-2d: VTCM overflow: used %zu budget %zu", vtcm_used, vtcm_budget); - return -1; - } - - hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); - - HAP_compute_res_hmx_lock(ctx->vtcm_rctx); - - for (size_t mr = 0; mr < (size_t) m_padded; mr += m_chunk_n_rows) { - const size_t n_rows = hex_smin(m_padded - mr, m_chunk_n_rows); - const size_t n_row_tiles = hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS); - - transfer_activation_chunk_gathered_threaded( - ctx, vtcm_activation, activation, (int) mr, (int) n_rows, k, - matrix_rows, cur_a, mapping_stride, ne11, act_nb1, act_nb2, cne1, num_threads); - - for (size_t nc = 0; nc < (size_t) n; nc += n_chunk_n_cols) { - const size_t n_cols = hex_smin((size_t) n - nc, n_chunk_n_cols); - const size_t n_col_tiles = hmx_ceil_div(n_cols, HMX_FP16_TILE_N_COLS); - - const uint8_t *qweight_chunk = permuted_weight + nc * weight_stride; - dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_weight, qweight_chunk), row_stride, weight_stride, row_stride, n_cols); - dma_queue_pop(ctx->dma[0]); - - dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_scratch0, vtcm_weight, n_cols, k, row_stride, weight_type, n_k_tiles, n_k_tiles_div, dequant_worker_fn, num_threads); - - core_dot_chunk_fp16(vtcm_output, vtcm_activation, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HMX_FP16_TILE_N_ROWS); - - transfer_output_chunk_scattered_threaded( - ctx, dst, vtcm_output, (int) mr, (int) n_rows, (int) n_cols, - matrix_rows, cur_a, mapping_stride, dst_nb1, dst_nb2, cne1, num_threads); - } - } - - HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); - return 0; -} diff --git a/ggml/src/ggml-hexagon/htp/hmx-mm-kernels-tiled.h b/ggml/src/ggml-hexagon/htp/hmx-mm-kernels-tiled.h new file mode 100644 index 000000000000..b7fba22a87f3 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hmx-mm-kernels-tiled.h @@ -0,0 +1,1306 @@ +#include "hmx-utils.h" +#include "hmx-queue.h" + +// MXFP4 dequantization LUT: maps 4-bit index to fp16 mantissa value +// kvalues: 0, 0.5, 1, 1.5, 2, 3, 4, 6, 0, -0.5, -1, -1.5, -2, -3, -4, -6 +static const __fp16 mxfp4_to_fp16_lut[64] __attribute__((aligned(VLEN))) = { + 0, 0, 0.5, 0, 1, 0, 1.5, 0, 2, 0, 3, 0, 4, 0, 6, 0, 0, 0, -0.5, 0, -1, 0, -1.5, 0, -2, 0, -3, 0, -4, 0, -6, 0, +}; + +static const __fp16 iq4_nl_to_fp16_lut[64] __attribute__((aligned(VLEN))) = { + -127, 0, -104, 0, -83, 0, -65, 0, -49, 0, -35, 0, -22, 0, -10, 0, + 1, 0, 13, 0, 25, 0, 38, 0, 53, 0, 69, 0, 89, 0, 113, 0, +}; + +// --- tiled format dequantizers --- + +typedef struct { + struct htp_context * ctx; + struct htp_thread_trace * traces; + __fp16 * dst; + const uint8_t * src; + + struct fastdiv_values n_k_tiles_div; + uint32_t n_k_tiles; + uint32_t n_tot_tiles; + uint32_t n_tiles_per_task; + uint32_t tile_size; + uint32_t aligned_tile_size; + uint32_t n_tasks; + uint32_t n_cols; + uint32_t k_block; + size_t row_stride; + uint32_t weight_type; +} tiled_dequantize_state_t; + +// Dequantize a single tile from tiled weight data (already in VTCM) to tile-major FP16. +static void dequantize_tiled_weight_to_fp16_task_q4_0( + const tiled_dequantize_state_t *state, + uint32_t start_tile, uint32_t end_tile) { + + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + const HVX_Vector i8 = Q6_Vb_vsplat_R(8); + + for (uint32_t t = start_tile; t < end_tile; t++) { + const uint8_t * tile_src = state->src + t * state->aligned_tile_size; + __fp16 * dst_ptr = state->dst + t * HTP_MM_HMX_TILE_N_ELMS; + + HVX_Vector v_sc = hvx_vmem(tile_src + 512); + HVX_Vector v_scale_duplicated = Q6_V_lo_W(Q6_W_vshuff_VVR(v_sc, v_sc, -2)); + + // Load all 4 groups in parallel + HVX_Vector vq0 = hvx_vmem(tile_src + 0 * 128); + HVX_Vector vq1 = hvx_vmem(tile_src + 1 * 128); + HVX_Vector vq2 = hvx_vmem(tile_src + 2 * 128); + HVX_Vector vq3 = hvx_vmem(tile_src + 3 * 128); + + // Nibble extraction + HVX_Vector v_lo0 = Q6_V_vand_VV(vq0, mask_h4); + HVX_Vector v_hi0 = Q6_Vub_vlsr_VubR(vq0, 4); + HVX_Vector v_lo1 = Q6_V_vand_VV(vq1, mask_h4); + HVX_Vector v_hi1 = Q6_Vub_vlsr_VubR(vq1, 4); + HVX_Vector v_lo2 = Q6_V_vand_VV(vq2, mask_h4); + HVX_Vector v_hi2 = Q6_Vub_vlsr_VubR(vq2, 4); + HVX_Vector v_lo3 = Q6_V_vand_VV(vq3, mask_h4); + HVX_Vector v_hi3 = Q6_Vub_vlsr_VubR(vq3, 4); + + // Offsetting (-8) + v_lo0 = Q6_Vb_vsub_VbVb(v_lo0, i8); + v_hi0 = Q6_Vb_vsub_VbVb(v_hi0, i8); + v_lo1 = Q6_Vb_vsub_VbVb(v_lo1, i8); + v_hi1 = Q6_Vb_vsub_VbVb(v_hi1, i8); + v_lo2 = Q6_Vb_vsub_VbVb(v_lo2, i8); + v_hi2 = Q6_Vb_vsub_VbVb(v_hi2, i8); + v_lo3 = Q6_Vb_vsub_VbVb(v_lo3, i8); + v_hi3 = Q6_Vb_vsub_VbVb(v_hi3, i8); + + // Shuffling + HVX_VectorPair vp_shuf0 = Q6_W_vshuff_VVR(v_hi0, v_lo0, -1); + HVX_VectorPair vp_shuf1 = Q6_W_vshuff_VVR(v_hi1, v_lo1, -1); + HVX_VectorPair vp_shuf2 = Q6_W_vshuff_VVR(v_hi2, v_lo2, -1); + HVX_VectorPair vp_shuf3 = Q6_W_vshuff_VVR(v_hi3, v_lo3, -1); + + // Unpack to 16-bit + HVX_VectorPair vp_int16_lo0 = Q6_Wh_vunpack_Vb(Q6_V_lo_W(vp_shuf0)); + HVX_VectorPair vp_int16_hi0 = Q6_Wh_vunpack_Vb(Q6_V_hi_W(vp_shuf0)); + HVX_VectorPair vp_int16_lo1 = Q6_Wh_vunpack_Vb(Q6_V_lo_W(vp_shuf1)); + HVX_VectorPair vp_int16_hi1 = Q6_Wh_vunpack_Vb(Q6_V_hi_W(vp_shuf1)); + HVX_VectorPair vp_int16_lo2 = Q6_Wh_vunpack_Vb(Q6_V_lo_W(vp_shuf2)); + HVX_VectorPair vp_int16_hi2 = Q6_Wh_vunpack_Vb(Q6_V_hi_W(vp_shuf2)); + HVX_VectorPair vp_int16_lo3 = Q6_Wh_vunpack_Vb(Q6_V_lo_W(vp_shuf3)); + HVX_VectorPair vp_int16_hi3 = Q6_Wh_vunpack_Vb(Q6_V_hi_W(vp_shuf3)); + + // Convert and scale multiplication + HVX_Vector v_grp0_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_lo0)), v_scale_duplicated)); + HVX_Vector v_grp0_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_lo0)), v_scale_duplicated)); + HVX_Vector v_grp0_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_hi0)), v_scale_duplicated)); + HVX_Vector v_grp0_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_hi0)), v_scale_duplicated)); + + HVX_Vector v_grp1_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_lo1)), v_scale_duplicated)); + HVX_Vector v_grp1_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_lo1)), v_scale_duplicated)); + HVX_Vector v_grp1_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_hi1)), v_scale_duplicated)); + HVX_Vector v_grp1_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_hi1)), v_scale_duplicated)); + + HVX_Vector v_grp2_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_lo2)), v_scale_duplicated)); + HVX_Vector v_grp2_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_lo2)), v_scale_duplicated)); + HVX_Vector v_grp2_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_hi2)), v_scale_duplicated)); + HVX_Vector v_grp2_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_hi2)), v_scale_duplicated)); + + HVX_Vector v_grp3_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_lo3)), v_scale_duplicated)); + HVX_Vector v_grp3_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_lo3)), v_scale_duplicated)); + HVX_Vector v_grp3_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_hi3)), v_scale_duplicated)); + HVX_Vector v_grp3_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_hi3)), v_scale_duplicated)); + + hvx_vmem(dst_ptr + 0 * 64) = v_grp0_0; + hvx_vmem(dst_ptr + 1 * 64) = v_grp0_1; + hvx_vmem(dst_ptr + 2 * 64) = v_grp0_2; + hvx_vmem(dst_ptr + 3 * 64) = v_grp0_3; + + hvx_vmem(dst_ptr + 4 * 64) = v_grp1_0; + hvx_vmem(dst_ptr + 5 * 64) = v_grp1_1; + hvx_vmem(dst_ptr + 6 * 64) = v_grp1_2; + hvx_vmem(dst_ptr + 7 * 64) = v_grp1_3; + + hvx_vmem(dst_ptr + 8 * 64) = v_grp2_0; + hvx_vmem(dst_ptr + 9 * 64) = v_grp2_1; + hvx_vmem(dst_ptr + 10 * 64) = v_grp2_2; + hvx_vmem(dst_ptr + 11 * 64) = v_grp2_3; + + hvx_vmem(dst_ptr + 12 * 64) = v_grp3_0; + hvx_vmem(dst_ptr + 13 * 64) = v_grp3_1; + hvx_vmem(dst_ptr + 14 * 64) = v_grp3_2; + hvx_vmem(dst_ptr + 15 * 64) = v_grp3_3; + } +} + +static void dequantize_tiled_weight_to_fp16_task_q4_1( + const tiled_dequantize_state_t *state, + uint32_t start_tile, uint32_t end_tile) { + + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + + for (uint32_t t = start_tile; t < end_tile; t++) { + const uint8_t * tile_src = state->src + t * state->aligned_tile_size; + __fp16 * dst_ptr = state->dst + t * HTP_MM_HMX_TILE_N_ELMS; + + HVX_Vector vscale_offset = hvx_vmem(tile_src + 512); + HVX_VectorPair dm_deal = Q6_W_vdeal_VVR(vscale_offset, vscale_offset, -2); + HVX_Vector vd = Q6_V_lo_W(dm_deal); + HVX_Vector vm = Q6_V_hi_W(dm_deal); + + HVX_Vector v_scale_duplicated = Q6_V_lo_W(Q6_W_vshuff_VVR(vd, vd, -2)); + HVX_Vector v_offset_duplicated = Q6_V_lo_W(Q6_W_vshuff_VVR(vm, vm, -2)); + + // Load all 4 groups in parallel + HVX_Vector vq0 = hvx_vmem(tile_src + 0 * 128); + HVX_Vector vq1 = hvx_vmem(tile_src + 1 * 128); + HVX_Vector vq2 = hvx_vmem(tile_src + 2 * 128); + HVX_Vector vq3 = hvx_vmem(tile_src + 3 * 128); + + // Nibble extraction + HVX_Vector v_lo0 = Q6_V_vand_VV(vq0, mask_h4); + HVX_Vector v_hi0 = Q6_Vub_vlsr_VubR(vq0, 4); + HVX_Vector v_lo1 = Q6_V_vand_VV(vq1, mask_h4); + HVX_Vector v_hi1 = Q6_Vub_vlsr_VubR(vq1, 4); + HVX_Vector v_lo2 = Q6_V_vand_VV(vq2, mask_h4); + HVX_Vector v_hi2 = Q6_Vub_vlsr_VubR(vq2, 4); + HVX_Vector v_lo3 = Q6_V_vand_VV(vq3, mask_h4); + HVX_Vector v_hi3 = Q6_Vub_vlsr_VubR(vq3, 4); + + // Shuffling + HVX_VectorPair vp_shuf0 = Q6_W_vshuff_VVR(v_hi0, v_lo0, -1); + HVX_VectorPair vp_shuf1 = Q6_W_vshuff_VVR(v_hi1, v_lo1, -1); + HVX_VectorPair vp_shuf2 = Q6_W_vshuff_VVR(v_hi2, v_lo2, -1); + HVX_VectorPair vp_shuf3 = Q6_W_vshuff_VVR(v_hi3, v_lo3, -1); + + // Unpack to 16-bit + HVX_VectorPair vp_int16_lo0 = Q6_Wh_vunpack_Vb(Q6_V_lo_W(vp_shuf0)); + HVX_VectorPair vp_int16_hi0 = Q6_Wh_vunpack_Vb(Q6_V_hi_W(vp_shuf0)); + HVX_VectorPair vp_int16_lo1 = Q6_Wh_vunpack_Vb(Q6_V_lo_W(vp_shuf1)); + HVX_VectorPair vp_int16_hi1 = Q6_Wh_vunpack_Vb(Q6_V_hi_W(vp_shuf1)); + HVX_VectorPair vp_int16_lo2 = Q6_Wh_vunpack_Vb(Q6_V_lo_W(vp_shuf2)); + HVX_VectorPair vp_int16_hi2 = Q6_Wh_vunpack_Vb(Q6_V_hi_W(vp_shuf2)); + HVX_VectorPair vp_int16_lo3 = Q6_Wh_vunpack_Vb(Q6_V_lo_W(vp_shuf3)); + HVX_VectorPair vp_int16_hi3 = Q6_Wh_vunpack_Vb(Q6_V_hi_W(vp_shuf3)); + + // Convert, multiply, add offset + HVX_Vector v_grp0_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_lo0)), v_scale_duplicated), v_offset_duplicated)); + HVX_Vector v_grp0_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_lo0)), v_scale_duplicated), v_offset_duplicated)); + HVX_Vector v_grp0_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_hi0)), v_scale_duplicated), v_offset_duplicated)); + HVX_Vector v_grp0_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_hi0)), v_scale_duplicated), v_offset_duplicated)); + + HVX_Vector v_grp1_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_lo1)), v_scale_duplicated), v_offset_duplicated)); + HVX_Vector v_grp1_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_lo1)), v_scale_duplicated), v_offset_duplicated)); + HVX_Vector v_grp1_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_hi1)), v_scale_duplicated), v_offset_duplicated)); + HVX_Vector v_grp1_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_hi1)), v_scale_duplicated), v_offset_duplicated)); + + HVX_Vector v_grp2_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_lo2)), v_scale_duplicated), v_offset_duplicated)); + HVX_Vector v_grp2_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_lo2)), v_scale_duplicated), v_offset_duplicated)); + HVX_Vector v_grp2_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_hi2)), v_scale_duplicated), v_offset_duplicated)); + HVX_Vector v_grp2_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_hi2)), v_scale_duplicated), v_offset_duplicated)); + + HVX_Vector v_grp3_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_lo3)), v_scale_duplicated), v_offset_duplicated)); + HVX_Vector v_grp3_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_lo3)), v_scale_duplicated), v_offset_duplicated)); + HVX_Vector v_grp3_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_hi3)), v_scale_duplicated), v_offset_duplicated)); + HVX_Vector v_grp3_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vadd_Vqf16Vhf(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_hi3)), v_scale_duplicated), v_offset_duplicated)); + + // Parallel Stores + hvx_vmem(dst_ptr + 0 * 64) = v_grp0_0; + hvx_vmem(dst_ptr + 1 * 64) = v_grp0_1; + hvx_vmem(dst_ptr + 2 * 64) = v_grp0_2; + hvx_vmem(dst_ptr + 3 * 64) = v_grp0_3; + + hvx_vmem(dst_ptr + 4 * 64) = v_grp1_0; + hvx_vmem(dst_ptr + 5 * 64) = v_grp1_1; + hvx_vmem(dst_ptr + 6 * 64) = v_grp1_2; + hvx_vmem(dst_ptr + 7 * 64) = v_grp1_3; + + hvx_vmem(dst_ptr + 8 * 64) = v_grp2_0; + hvx_vmem(dst_ptr + 9 * 64) = v_grp2_1; + hvx_vmem(dst_ptr + 10 * 64) = v_grp2_2; + hvx_vmem(dst_ptr + 11 * 64) = v_grp2_3; + + hvx_vmem(dst_ptr + 12 * 64) = v_grp3_0; + hvx_vmem(dst_ptr + 13 * 64) = v_grp3_1; + hvx_vmem(dst_ptr + 14 * 64) = v_grp3_2; + hvx_vmem(dst_ptr + 15 * 64) = v_grp3_3; + } +} + +static void dequantize_tiled_weight_to_fp16_task_iq4_nl( + const tiled_dequantize_state_t *state, + uint32_t start_tile, uint32_t end_tile) { + + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + const HVX_Vector vlut_cvt = hvx_vmem(iq4_nl_to_fp16_lut); + + for (uint32_t t = start_tile; t < end_tile; t++) { + const uint8_t * tile_src = state->src + t * state->aligned_tile_size; + __fp16 * dst_ptr = state->dst + t * HTP_MM_HMX_TILE_N_ELMS; + + HVX_Vector v_sc = hvx_vmem(tile_src + 512); + HVX_Vector v_scale_duplicated = Q6_V_lo_W(Q6_W_vshuff_VVR(v_sc, v_sc, -2)); + + // Load all 4 groups in parallel + HVX_Vector vq0 = hvx_vmem(tile_src + 0 * 128); + HVX_Vector vq1 = hvx_vmem(tile_src + 1 * 128); + HVX_Vector vq2 = hvx_vmem(tile_src + 2 * 128); + HVX_Vector vq3 = hvx_vmem(tile_src + 3 * 128); + + // Nibble extraction + HVX_Vector v_lo0 = Q6_V_vand_VV(vq0, mask_h4); + HVX_Vector v_hi0 = Q6_Vub_vlsr_VubR(vq0, 4); + HVX_Vector v_lo1 = Q6_V_vand_VV(vq1, mask_h4); + HVX_Vector v_hi1 = Q6_Vub_vlsr_VubR(vq1, 4); + HVX_Vector v_lo2 = Q6_V_vand_VV(vq2, mask_h4); + HVX_Vector v_hi2 = Q6_Vub_vlsr_VubR(vq2, 4); + HVX_Vector v_lo3 = Q6_V_vand_VV(vq3, mask_h4); + HVX_Vector v_hi3 = Q6_Vub_vlsr_VubR(vq3, 4); + + // Shuffling + HVX_VectorPair vp_shuf0 = Q6_W_vshuff_VVR(v_hi0, v_lo0, -1); + HVX_VectorPair vp_shuf1 = Q6_W_vshuff_VVR(v_hi1, v_lo1, -1); + HVX_VectorPair vp_shuf2 = Q6_W_vshuff_VVR(v_hi2, v_lo2, -1); + HVX_VectorPair vp_shuf3 = Q6_W_vshuff_VVR(v_hi3, v_lo3, -1); + + // Shuffle for LUT lookup + HVX_Vector v_q_lo0 = Q6_Vb_vshuff_Vb(Q6_V_lo_W(vp_shuf0)); + HVX_Vector v_q_hi0 = Q6_Vb_vshuff_Vb(Q6_V_hi_W(vp_shuf0)); + HVX_Vector v_q_lo1 = Q6_Vb_vshuff_Vb(Q6_V_lo_W(vp_shuf1)); + HVX_Vector v_q_hi1 = Q6_Vb_vshuff_Vb(Q6_V_hi_W(vp_shuf1)); + HVX_Vector v_q_lo2 = Q6_Vb_vshuff_Vb(Q6_V_lo_W(vp_shuf2)); + HVX_Vector v_q_hi2 = Q6_Vb_vshuff_Vb(Q6_V_hi_W(vp_shuf2)); + HVX_Vector v_q_lo3 = Q6_Vb_vshuff_Vb(Q6_V_lo_W(vp_shuf3)); + HVX_Vector v_q_hi3 = Q6_Vb_vshuff_Vb(Q6_V_hi_W(vp_shuf3)); + + // LUT lookup + HVX_VectorPair vp_lo0 = Q6_Wh_vlut16_VbVhR(v_q_lo0, vlut_cvt, 0); + HVX_VectorPair vp_hi0 = Q6_Wh_vlut16_VbVhR(v_q_hi0, vlut_cvt, 0); + HVX_VectorPair vp_lo1 = Q6_Wh_vlut16_VbVhR(v_q_lo1, vlut_cvt, 0); + HVX_VectorPair vp_hi1 = Q6_Wh_vlut16_VbVhR(v_q_hi1, vlut_cvt, 0); + HVX_VectorPair vp_lo2 = Q6_Wh_vlut16_VbVhR(v_q_lo2, vlut_cvt, 0); + HVX_VectorPair vp_hi2 = Q6_Wh_vlut16_VbVhR(v_q_hi2, vlut_cvt, 0); + HVX_VectorPair vp_lo3 = Q6_Wh_vlut16_VbVhR(v_q_lo3, vlut_cvt, 0); + HVX_VectorPair vp_hi3 = Q6_Wh_vlut16_VbVhR(v_q_hi3, vlut_cvt, 0); + + // Convert and scale multiplication + HVX_Vector v_grp0_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_lo0), v_scale_duplicated)); + HVX_Vector v_grp0_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_lo0), v_scale_duplicated)); + HVX_Vector v_grp0_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_hi0), v_scale_duplicated)); + HVX_Vector v_grp0_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_hi0), v_scale_duplicated)); + + HVX_Vector v_grp1_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_lo1), v_scale_duplicated)); + HVX_Vector v_grp1_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_lo1), v_scale_duplicated)); + HVX_Vector v_grp1_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_hi1), v_scale_duplicated)); + HVX_Vector v_grp1_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_hi1), v_scale_duplicated)); + + HVX_Vector v_grp2_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_lo2), v_scale_duplicated)); + HVX_Vector v_grp2_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_lo2), v_scale_duplicated)); + HVX_Vector v_grp2_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_hi2), v_scale_duplicated)); + HVX_Vector v_grp2_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_hi2), v_scale_duplicated)); + + HVX_Vector v_grp3_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_lo3), v_scale_duplicated)); + HVX_Vector v_grp3_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_lo3), v_scale_duplicated)); + HVX_Vector v_grp3_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_hi3), v_scale_duplicated)); + HVX_Vector v_grp3_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_hi3), v_scale_duplicated)); + + hvx_vmem(dst_ptr + 0 * 64) = v_grp0_0; + hvx_vmem(dst_ptr + 1 * 64) = v_grp0_1; + hvx_vmem(dst_ptr + 2 * 64) = v_grp0_2; + hvx_vmem(dst_ptr + 3 * 64) = v_grp0_3; + + hvx_vmem(dst_ptr + 4 * 64) = v_grp1_0; + hvx_vmem(dst_ptr + 5 * 64) = v_grp1_1; + hvx_vmem(dst_ptr + 6 * 64) = v_grp1_2; + hvx_vmem(dst_ptr + 7 * 64) = v_grp1_3; + + hvx_vmem(dst_ptr + 8 * 64) = v_grp2_0; + hvx_vmem(dst_ptr + 9 * 64) = v_grp2_1; + hvx_vmem(dst_ptr + 10 * 64) = v_grp2_2; + hvx_vmem(dst_ptr + 11 * 64) = v_grp2_3; + + hvx_vmem(dst_ptr + 12 * 64) = v_grp3_0; + hvx_vmem(dst_ptr + 13 * 64) = v_grp3_1; + hvx_vmem(dst_ptr + 14 * 64) = v_grp3_2; + hvx_vmem(dst_ptr + 15 * 64) = v_grp3_3; + } +} + +static void dequantize_tiled_weight_to_fp16_task_mxfp4( + const tiled_dequantize_state_t *state, + uint32_t start_tile, uint32_t end_tile) { + + const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + const HVX_Vector vlut_cvt = hvx_vmem(mxfp4_to_fp16_lut); + + for (uint32_t t = start_tile; t < end_tile; t++) { + const uint8_t * tile_src = state->src + t * state->aligned_tile_size; + __fp16 * dst_ptr = state->dst + t * HTP_MM_HMX_TILE_N_ELMS; + + HVX_Vector v = hvx_vmem(tile_src + 512); + HVX_Vector vh = Q6_V_lo_W(Q6_Wuh_vunpack_Vub(v)); + vh = Q6_Vh_vsub_VhVh(vh, Q6_Vh_vsplat_R(112)); + vh = Q6_Vh_vmax_VhVh(vh, Q6_V_vzero()); + vh = Q6_Vh_vmin_VhVh(vh, Q6_Vh_vsplat_R(30)); + vh = Q6_Vh_vasl_VhR(vh, 10); + + HVX_Vector v_scale_duplicated = Q6_V_lo_W(Q6_W_vshuff_VVR(vh, vh, -2)); + + // Load all 4 groups in parallel + HVX_Vector vq0 = hvx_vmem(tile_src + 0 * 128); + HVX_Vector vq1 = hvx_vmem(tile_src + 1 * 128); + HVX_Vector vq2 = hvx_vmem(tile_src + 2 * 128); + HVX_Vector vq3 = hvx_vmem(tile_src + 3 * 128); + + // Nibble extraction + HVX_Vector v_lo0 = Q6_V_vand_VV(vq0, mask_h4); + HVX_Vector v_hi0 = Q6_Vub_vlsr_VubR(vq0, 4); + HVX_Vector v_lo1 = Q6_V_vand_VV(vq1, mask_h4); + HVX_Vector v_hi1 = Q6_Vub_vlsr_VubR(vq1, 4); + HVX_Vector v_lo2 = Q6_V_vand_VV(vq2, mask_h4); + HVX_Vector v_hi2 = Q6_Vub_vlsr_VubR(vq2, 4); + HVX_Vector v_lo3 = Q6_V_vand_VV(vq3, mask_h4); + HVX_Vector v_hi3 = Q6_Vub_vlsr_VubR(vq3, 4); + + // Shuffling + HVX_VectorPair vp_shuf0 = Q6_W_vshuff_VVR(v_hi0, v_lo0, -1); + HVX_VectorPair vp_shuf1 = Q6_W_vshuff_VVR(v_hi1, v_lo1, -1); + HVX_VectorPair vp_shuf2 = Q6_W_vshuff_VVR(v_hi2, v_lo2, -1); + HVX_VectorPair vp_shuf3 = Q6_W_vshuff_VVR(v_hi3, v_lo3, -1); + + // Shuffle for LUT lookup + HVX_Vector v_q_lo0 = Q6_Vb_vshuff_Vb(Q6_V_lo_W(vp_shuf0)); + HVX_Vector v_q_hi0 = Q6_Vb_vshuff_Vb(Q6_V_hi_W(vp_shuf0)); + HVX_Vector v_q_lo1 = Q6_Vb_vshuff_Vb(Q6_V_lo_W(vp_shuf1)); + HVX_Vector v_q_hi1 = Q6_Vb_vshuff_Vb(Q6_V_hi_W(vp_shuf1)); + HVX_Vector v_q_lo2 = Q6_Vb_vshuff_Vb(Q6_V_lo_W(vp_shuf2)); + HVX_Vector v_q_hi2 = Q6_Vb_vshuff_Vb(Q6_V_hi_W(vp_shuf2)); + HVX_Vector v_q_lo3 = Q6_Vb_vshuff_Vb(Q6_V_lo_W(vp_shuf3)); + HVX_Vector v_q_hi3 = Q6_Vb_vshuff_Vb(Q6_V_hi_W(vp_shuf3)); + + // LUT lookup + HVX_VectorPair vp_lo0 = Q6_Wh_vlut16_VbVhR(v_q_lo0, vlut_cvt, 0); + HVX_VectorPair vp_hi0 = Q6_Wh_vlut16_VbVhR(v_q_hi0, vlut_cvt, 0); + HVX_VectorPair vp_lo1 = Q6_Wh_vlut16_VbVhR(v_q_lo1, vlut_cvt, 0); + HVX_VectorPair vp_hi1 = Q6_Wh_vlut16_VbVhR(v_q_hi1, vlut_cvt, 0); + HVX_VectorPair vp_lo2 = Q6_Wh_vlut16_VbVhR(v_q_lo2, vlut_cvt, 0); + HVX_VectorPair vp_hi2 = Q6_Wh_vlut16_VbVhR(v_q_hi2, vlut_cvt, 0); + HVX_VectorPair vp_lo3 = Q6_Wh_vlut16_VbVhR(v_q_lo3, vlut_cvt, 0); + HVX_VectorPair vp_hi3 = Q6_Wh_vlut16_VbVhR(v_q_hi3, vlut_cvt, 0); + + // Convert and scale multiplication + HVX_Vector v_grp0_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_lo0), v_scale_duplicated)); + HVX_Vector v_grp0_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_lo0), v_scale_duplicated)); + HVX_Vector v_grp0_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_hi0), v_scale_duplicated)); + HVX_Vector v_grp0_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_hi0), v_scale_duplicated)); + + HVX_Vector v_grp1_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_lo1), v_scale_duplicated)); + HVX_Vector v_grp1_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_lo1), v_scale_duplicated)); + HVX_Vector v_grp1_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_hi1), v_scale_duplicated)); + HVX_Vector v_grp1_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_hi1), v_scale_duplicated)); + + HVX_Vector v_grp2_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_lo2), v_scale_duplicated)); + HVX_Vector v_grp2_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_lo2), v_scale_duplicated)); + HVX_Vector v_grp2_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_hi2), v_scale_duplicated)); + HVX_Vector v_grp2_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_hi2), v_scale_duplicated)); + + HVX_Vector v_grp3_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_lo3), v_scale_duplicated)); + HVX_Vector v_grp3_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_lo3), v_scale_duplicated)); + HVX_Vector v_grp3_2 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_lo_W(vp_hi3), v_scale_duplicated)); + HVX_Vector v_grp3_3 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_V_hi_W(vp_hi3), v_scale_duplicated)); + + hvx_vmem(dst_ptr + 0 * 64) = v_grp0_0; + hvx_vmem(dst_ptr + 1 * 64) = v_grp0_1; + hvx_vmem(dst_ptr + 2 * 64) = v_grp0_2; + hvx_vmem(dst_ptr + 3 * 64) = v_grp0_3; + + hvx_vmem(dst_ptr + 4 * 64) = v_grp1_0; + hvx_vmem(dst_ptr + 5 * 64) = v_grp1_1; + hvx_vmem(dst_ptr + 6 * 64) = v_grp1_2; + hvx_vmem(dst_ptr + 7 * 64) = v_grp1_3; + + hvx_vmem(dst_ptr + 8 * 64) = v_grp2_0; + hvx_vmem(dst_ptr + 9 * 64) = v_grp2_1; + hvx_vmem(dst_ptr + 10 * 64) = v_grp2_2; + hvx_vmem(dst_ptr + 11 * 64) = v_grp2_3; + + hvx_vmem(dst_ptr + 12 * 64) = v_grp3_0; + hvx_vmem(dst_ptr + 13 * 64) = v_grp3_1; + hvx_vmem(dst_ptr + 14 * 64) = v_grp3_2; + hvx_vmem(dst_ptr + 15 * 64) = v_grp3_3; + } +} + +static void dequantize_tiled_weight_to_fp16_task_q8_0( + const tiled_dequantize_state_t *state, + uint32_t start_tile, uint32_t end_tile) { + + for (uint32_t t = start_tile; t < end_tile; t++) { + const uint8_t * tile_src = state->src + t * state->aligned_tile_size; + __fp16 * dst_ptr = state->dst + t * HTP_MM_HMX_TILE_N_ELMS; + + HVX_Vector v_sc = hvx_vmem(tile_src + 1024); + HVX_Vector v_scale_duplicated = Q6_V_lo_W(Q6_W_vshuff_VVR(v_sc, v_sc, -2)); + + // Load groups 0-3 in parallel + HVX_Vector vq0 = hvx_vmem(tile_src + 0 * 128); + HVX_Vector vq1 = hvx_vmem(tile_src + 1 * 128); + HVX_Vector vq2 = hvx_vmem(tile_src + 2 * 128); + HVX_Vector vq3 = hvx_vmem(tile_src + 3 * 128); + + HVX_VectorPair vp_int16_0 = Q6_Wh_vunpack_Vb(vq0); + HVX_VectorPair vp_int16_1 = Q6_Wh_vunpack_Vb(vq1); + HVX_VectorPair vp_int16_2 = Q6_Wh_vunpack_Vb(vq2); + HVX_VectorPair vp_int16_3 = Q6_Wh_vunpack_Vb(vq3); + + // Load groups 4-7 in parallel + HVX_Vector vq4 = hvx_vmem(tile_src + 4 * 128); + HVX_Vector vq5 = hvx_vmem(tile_src + 5 * 128); + HVX_Vector vq6 = hvx_vmem(tile_src + 6 * 128); + HVX_Vector vq7 = hvx_vmem(tile_src + 7 * 128); + + HVX_VectorPair vp_int16_4 = Q6_Wh_vunpack_Vb(vq4); + HVX_VectorPair vp_int16_5 = Q6_Wh_vunpack_Vb(vq5); + HVX_VectorPair vp_int16_6 = Q6_Wh_vunpack_Vb(vq6); + HVX_VectorPair vp_int16_7 = Q6_Wh_vunpack_Vb(vq7); + + // Convert and scale multiply for groups 0-3 + HVX_Vector v_grp0_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_0)), v_scale_duplicated)); + HVX_Vector v_grp0_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_0)), v_scale_duplicated)); + HVX_Vector v_grp1_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_1)), v_scale_duplicated)); + HVX_Vector v_grp1_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_1)), v_scale_duplicated)); + HVX_Vector v_grp2_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_2)), v_scale_duplicated)); + HVX_Vector v_grp2_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_2)), v_scale_duplicated)); + HVX_Vector v_grp3_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_3)), v_scale_duplicated)); + HVX_Vector v_grp3_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_3)), v_scale_duplicated)); + + // Store groups 0-3 + hvx_vmem(dst_ptr + 0 * 64) = v_grp0_0; + hvx_vmem(dst_ptr + 1 * 64) = v_grp0_1; + hvx_vmem(dst_ptr + 2 * 64) = v_grp1_0; + hvx_vmem(dst_ptr + 3 * 64) = v_grp1_1; + hvx_vmem(dst_ptr + 4 * 64) = v_grp2_0; + hvx_vmem(dst_ptr + 5 * 64) = v_grp2_1; + hvx_vmem(dst_ptr + 6 * 64) = v_grp3_0; + hvx_vmem(dst_ptr + 7 * 64) = v_grp3_1; + + // Convert and scale multiply for groups 4-7 + HVX_Vector v_grp4_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_4)), v_scale_duplicated)); + HVX_Vector v_grp4_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_4)), v_scale_duplicated)); + HVX_Vector v_grp5_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_5)), v_scale_duplicated)); + HVX_Vector v_grp5_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_5)), v_scale_duplicated)); + HVX_Vector v_grp6_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_6)), v_scale_duplicated)); + HVX_Vector v_grp6_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_6)), v_scale_duplicated)); + HVX_Vector v_grp7_0 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_lo_W(vp_int16_7)), v_scale_duplicated)); + HVX_Vector v_grp7_1 = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(Q6_Vhf_equals_Vh(Q6_V_hi_W(vp_int16_7)), v_scale_duplicated)); + + // Store groups 4-7 + hvx_vmem(dst_ptr + 8 * 64) = v_grp4_0; + hvx_vmem(dst_ptr + 9 * 64) = v_grp4_1; + hvx_vmem(dst_ptr + 10 * 64) = v_grp5_0; + hvx_vmem(dst_ptr + 11 * 64) = v_grp5_1; + hvx_vmem(dst_ptr + 12 * 64) = v_grp6_0; + hvx_vmem(dst_ptr + 13 * 64) = v_grp6_1; + hvx_vmem(dst_ptr + 14 * 64) = v_grp7_0; + hvx_vmem(dst_ptr + 15 * 64) = v_grp7_1; + } +} + +static void convert_f16_weight_to_fp16_tiles_task( + const tiled_dequantize_state_t *state, + uint32_t start_tile, uint32_t end_tile) { + + const uint32_t n_k_tiles = state->n_k_tiles; + const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div; + + const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets); + const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); + const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); + + unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div); + unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div); + + for (unsigned t = start_tile; t < (unsigned)end_tile; ) { + if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; } + + __fp16 *tile_base = state->dst + t * HTP_MM_HMX_TILE_N_ELMS; + { + uint32_t byte_off = kt * 32 * sizeof(__fp16); + + HVX_Vector v_off = v_scat_base; + for (uint32_t r = 0; r < HTP_MM_HMX_TILE_N_ROWS; r += 2) { + uint32_t row0 = ct * HTP_MM_HMX_TILE_N_COLS + r; + uint32_t row1 = row0 + 1; + + const uint8_t *r0 = state->src + row0 * state->row_stride; + const uint8_t *r1 = state->src + row1 * state->row_stride; + + HVX_Vector v0 = hvx_vmemu((const __fp16 *)(r0 + byte_off)); + HVX_Vector v1 = (row1 < state->n_cols) ? hvx_vmemu((const __fp16 *)(r1 + byte_off)) : Q6_V_vzero(); + + Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HTP_MM_HMX_TILE_SIZE - 1, v_off, v0); + v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); + Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HTP_MM_HMX_TILE_SIZE - 1, v_off, v1); + v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); + } + (void) *(volatile HVX_Vector *)(tile_base); + } + ++t; ++kt; + } + + if (start_tile < end_tile) { + (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HTP_MM_HMX_TILE_N_ELMS); + } +} + +static void quantize_f32_weight_to_fp16_tiles_task( + const tiled_dequantize_state_t *state, + uint32_t start_tile, uint32_t end_tile) { + + const uint32_t n_k_tiles = state->n_k_tiles; + const struct fastdiv_values n_k_tiles_div = state->n_k_tiles_div; + + const HVX_Vector v_scat_base = hvx_vmem(hmx_transpose_scatter_offsets); + const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); + const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); + + unsigned ct = fastdiv((unsigned)start_tile, &n_k_tiles_div); + unsigned kt = fastmodulo((unsigned)start_tile, n_k_tiles, &n_k_tiles_div); + + for (unsigned t = start_tile; t < (unsigned)end_tile; ) { + if (kt >= (unsigned)n_k_tiles) { kt = 0; ct++; } + + __fp16 *tile_base = state->dst + t * HTP_MM_HMX_TILE_N_ELMS; + { + uint32_t byte_off = kt * 32 * sizeof(float); + + HVX_Vector v_off = v_scat_base; + for (uint32_t r = 0; r < HTP_MM_HMX_TILE_N_ROWS; r += 2) { + uint32_t row0 = ct * HTP_MM_HMX_TILE_N_COLS + r; + uint32_t row1 = row0 + 1; + + const uint8_t *r0 = state->src + row0 * state->row_stride; + const uint8_t *r1 = state->src + row1 * state->row_stride; + + HVX_Vector v0_f32 = hvx_vmem((const float *)(r0 + byte_off)); + HVX_Vector v1_f32 = (row1 < state->n_cols) ? hvx_vmem((const float *)(r1 + byte_off)) : Q6_V_vzero(); + + HVX_Vector v_out = hvx_vec_f32_to_f16(v0_f32, v1_f32); + + Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HTP_MM_HMX_TILE_SIZE - 1, v_off, v_out); + v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); + + HVX_Vector v_out_hi = Q6_V_vror_VR(v_out, 64); + Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_base, HTP_MM_HMX_TILE_SIZE - 1, v_off, v_out_hi); + v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); + } + (void) *(volatile HVX_Vector *)(tile_base); + } + ++t; ++kt; + } + + if (start_tile < end_tile) { + (void) *(volatile HVX_Vector *)(state->dst + (end_tile - 1) * HTP_MM_HMX_TILE_N_ELMS); + } +} + +// --- End tiled dequantizers --- + +// requires external HMX lock +static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict activation, const __fp16 *restrict weight, const __fp16 *restrict scales, + uint32_t n_row_tiles, uint32_t n_col_tiles, uint32_t n_dot_tiles) { + __builtin_assume(n_row_tiles > 0); + __builtin_assume(n_col_tiles > 0); + __builtin_assume(n_dot_tiles > 0); + + Q6_bias_mxmem2_A((void *)scales); + for (uint32_t r = 0; r < n_row_tiles; ++r) { + for (size_t c = 0; c < n_col_tiles; ++c) { + Q6_mxclracc_hf(); + + const __fp16 *row_tiles = activation + r * n_dot_tiles * HTP_MM_HMX_TILE_N_ELMS; + const __fp16 *col_tiles = weight + c * n_dot_tiles * HTP_MM_HMX_TILE_N_ELMS; + + for (uint32_t k = 0, k_block; k < n_dot_tiles; k += k_block) { + k_block = hex_smin(n_dot_tiles - k, 32); + const uint32_t range = 2048u * (uint32_t)k_block - 1; + Q6_activation_hf_mxmem_RR_deep((unsigned int)row_tiles, range); + Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, range); + row_tiles += k_block * HTP_MM_HMX_TILE_N_ELMS; + col_tiles += k_block * HTP_MM_HMX_TILE_N_ELMS; + } + + __fp16 *out_tile = output + (r * n_col_tiles + c) * HTP_MM_HMX_TILE_N_ELMS; + Q6_mxmem_AR_after_hf(out_tile, 0); + } + } +} + +// C += AB +static void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, const __fp16 *restrict b, + const __fp16 *restrict col_scales, const __fp16 *restrict eye_tile, + uint32_t n_row_tiles, uint32_t n_col_tiles, uint32_t n_dot_tiles, bool zero_init) { + __builtin_assume(n_row_tiles > 0); + __builtin_assume(n_col_tiles > 0); + __builtin_assume(n_dot_tiles > 0); + + Q6_bias_mxmem2_A((void *)col_scales); + + const size_t dot_tile_stride = n_dot_tiles * HTP_MM_HMX_TILE_N_ELMS; + for (size_t i = 0; i < n_row_tiles; ++i) { + const __fp16 *row_base = a + i * dot_tile_stride; + __fp16 *res_base = c + i * n_col_tiles * HTP_MM_HMX_TILE_N_ELMS; + for (size_t j = 0; j < n_col_tiles; ++j) { + Q6_mxclracc_hf(); + + const __fp16 *col_tiles = b + j * dot_tile_stride; + const __fp16 *row_tiles = row_base; + __fp16 *accum_tile = res_base + j * HTP_MM_HMX_TILE_N_ELMS; + if (!zero_init) { + Q6_activation_hf_mxmem_RR((unsigned int)accum_tile, 2047); + Q6_weight_hf_mxmem_RR((unsigned int)eye_tile, 2047); + } + + for (uint32_t k = 0, k_block; k < n_dot_tiles; k += k_block) { + k_block = hex_smin(n_dot_tiles - k, 32); + const uint32_t range = 2048u * k_block - 1; + Q6_activation_hf_mxmem_RR_deep((unsigned int)row_tiles, range); + Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, range); + row_tiles += k_block * HTP_MM_HMX_TILE_N_ELMS; + col_tiles += k_block * HTP_MM_HMX_TILE_N_ELMS; + } + + Q6_mxmem_AR_after_hf(accum_tile, 0); + } + } +} + +// --- Async HMX matmul job (for pipeline overlap) --- + +typedef struct { + __fp16 * output; + const __fp16 * activation; + const __fp16 * weight; + const __fp16 * scales; + uint32_t n_row_tiles; + uint32_t n_col_tiles; + uint32_t n_dot_tiles; +} hmx_matmul_job_t; + +static void hmx_matmul_worker_fn(void * data) { + hmx_matmul_job_t * job = (hmx_matmul_job_t *) data; + FARF(HIGH, "hmx-mm-job: n_row_tiles %u n_col_tiles %u n_dot_tiles %u", job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles); + core_dot_chunk_fp16(job->output, job->activation, job->weight, job->scales, job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles); +} + +static inline void hmx_matmul_job_init(hmx_matmul_job_t * job, + __fp16 * output, + const __fp16 * activation, + const __fp16 * weight, + const __fp16 * scales, + uint32_t n_row_tiles, + uint32_t n_col_tiles, + uint32_t n_dot_tiles) { + job->output = output; + job->activation = activation; + job->weight = weight; + job->scales = scales; + job->n_row_tiles = n_row_tiles; + job->n_col_tiles = n_col_tiles; + job->n_dot_tiles = n_dot_tiles; +} + +// output : fp16 -> f32p + +static void transfer_output_chunk_fp16_to_fp32(float *restrict dst, const __fp16 *restrict vtcm_src, uint32_t start_row, uint32_t n_rows, uint32_t n_cols, uint32_t dst_stride, uint32_t dst_cols) { + assert(n_cols % HTP_MM_HMX_TILE_N_COLS == 0); + const size_t tile_row_stride = (n_cols / HTP_MM_HMX_TILE_N_COLS) * HTP_MM_HMX_TILE_N_ELMS; + + const HVX_Vector one = hvx_vec_splat_f16(1.0); + + const size_t limit_c = hex_smin(n_cols, dst_cols); + const size_t limit_c_aligned = (limit_c & ~31); + + for (size_t r = 0; r < n_rows; r += 2) { + const size_t r_idx0 = start_row + r + 0; + const size_t r0 = r_idx0 / HTP_MM_HMX_TILE_N_ROWS; + const size_t r1 = (r_idx0 % HTP_MM_HMX_TILE_N_ROWS) / 2; // index of the row pair within the tile + const __fp16 *row_base = vtcm_src + r0 * tile_row_stride; + float *output_row_base = dst + r * dst_stride; // global memory row base for row r (and r+1) + + #pragma unroll(4) + for (size_t c = 0; c < limit_c_aligned; c += HTP_MM_HMX_TILE_N_COLS) { + const size_t c0 = c / HTP_MM_HMX_TILE_N_COLS; + const __fp16 *tile = row_base + c0 * HTP_MM_HMX_TILE_N_ELMS; + HVX_Vector v = ((const HVX_Vector *) tile)[r1]; + HVX_VectorPair vp = Q6_Wqf32_vmpy_VhfVhf(v, one); + + HVX_Vector *pv_out0 = (HVX_Vector *) (output_row_base + c + 0); + HVX_Vector *pv_out1 = (HVX_Vector *) (output_row_base + c + dst_stride); + + *pv_out0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(vp)); + if (r + 1 < n_rows) { + *pv_out1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(vp)); + } + } + + if (limit_c_aligned < limit_c) { + size_t c = limit_c_aligned; + size_t valid_c = limit_c - c; + const size_t c0 = c / HTP_MM_HMX_TILE_N_COLS; + const __fp16 *tile = row_base + c0 * HTP_MM_HMX_TILE_N_ELMS; + HVX_Vector v = ((const HVX_Vector *) tile)[r1]; + HVX_VectorPair vp = Q6_Wqf32_vmpy_VhfVhf(v, one); + + hvx_vec_store_u(output_row_base + c, valid_c * sizeof(float), Q6_Vsf_equals_Vqf32(Q6_V_lo_W(vp))); + if (r + 1 < n_rows) { + hvx_vec_store_u(output_row_base + c + dst_stride, valid_c * sizeof(float), Q6_Vsf_equals_Vqf32(Q6_V_hi_W(vp))); + } + } + } +} + +typedef struct { + const __fp16 *vtcm_src; + float *dst; + uint32_t n_tasks; + uint32_t n_tot_chunks; + uint32_t n_chunks_per_task; + uint32_t n_cols; + uint32_t dst_stride; // DDR row stride + uint32_t dst_cols; // Actual output columns + struct htp_thread_trace * traces; +} output_transfer_task_state_t; + +// activations : fp32 -> fp16 + +static void transfer_activation_chunk_fp32_to_fp16(__fp16 *restrict vtcm_dst, const float *restrict src, uint32_t n_rows, uint32_t k_block, uint32_t k_stride, uint32_t k_valid) { + const uint32_t n_rows_padded = hex_align_up(n_rows, HTP_MM_HMX_TILE_N_ROWS); + const uint32_t n_rows_tiled = (n_rows / HTP_MM_HMX_TILE_N_ROWS) * HTP_MM_HMX_TILE_N_ROWS; + + uint32_t r = 0; + + #pragma unroll(2) + for (r = 0; r < n_rows_tiled; r += 2) { + uint32_t r0 = r / HTP_MM_HMX_TILE_N_ROWS; // tile row index + uint32_t r1 = r % HTP_MM_HMX_TILE_N_ROWS; // intra-tile row idx + + const float *ptr_in0 = src + (r + 0) * k_stride; + const float *ptr_in1 = src + (r + 1) * k_stride; + + uint32_t c = 0; + for (; c + 32 <= k_valid; c += 32) { + HVX_Vector v0 = *(const HVX_Vector *)(ptr_in0 + c); + HVX_Vector v1 = *(const HVX_Vector *)(ptr_in1 + c); + HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); + + uint32_t c0 = c / HTP_MM_HMX_TILE_N_COLS; // tile column index + uint32_t tile_idx = r0 * (k_block / HTP_MM_HMX_TILE_N_COLS) + c0; + + HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HTP_MM_HMX_TILE_N_ELMS); + tile[r1 / 2] = v_out; + } + if (c < k_block) { + HVX_Vector v0 = *(const HVX_Vector *)(ptr_in0 + c); + HVX_Vector v1 = *(const HVX_Vector *)(ptr_in1 + c); + + uint32_t rem = k_valid - c; + HVX_VectorPred mask = Q6_Q_vsetq2_R(rem > 0 ? rem * sizeof(float) : 0); + v0 = Q6_V_vmux_QVV(mask, v0, Q6_V_vzero()); + v1 = Q6_V_vmux_QVV(mask, v1, Q6_V_vzero()); + + HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); + + uint32_t c0 = c / HTP_MM_HMX_TILE_N_COLS; // tile column index + uint32_t tile_idx = r0 * (k_block / HTP_MM_HMX_TILE_N_COLS) + c0; + + HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HTP_MM_HMX_TILE_N_ELMS); + tile[r1 / 2] = v_out; + } + } + + for (; r < n_rows_padded; r += 2) { + uint32_t r0 = r / HTP_MM_HMX_TILE_N_ROWS; // tile row index + uint32_t r1 = r % HTP_MM_HMX_TILE_N_ROWS; // intra-tile row idx + + const bool row0_valid = r < n_rows; + const bool row1_valid = (r + 1) < n_rows; + + const float *ptr_in0 = row0_valid ? (src + (r + 0) * k_stride) : NULL; + const float *ptr_in1 = row1_valid ? (src + (r + 1) * k_stride) : NULL; + + uint32_t c = 0; + for (; c + 32 <= k_valid; c += 32) { + HVX_Vector v0 = Q6_V_vzero(); + HVX_Vector v1 = Q6_V_vzero(); + if (row0_valid) v0 = *(const HVX_Vector *)(ptr_in0 + c); + if (row1_valid) v1 = *(const HVX_Vector *)(ptr_in1 + c); + + HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); + + uint32_t c0 = c / HTP_MM_HMX_TILE_N_COLS; // tile column index + uint32_t tile_idx = r0 * (k_block / HTP_MM_HMX_TILE_N_COLS) + c0; + + HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HTP_MM_HMX_TILE_N_ELMS); + tile[r1 / 2] = v_out; + } + if (c < k_block) { + HVX_Vector v0 = Q6_V_vzero(); + HVX_Vector v1 = Q6_V_vzero(); + if (row0_valid) v0 = *(const HVX_Vector *)(ptr_in0 + c); + if (row1_valid) v1 = *(const HVX_Vector *)(ptr_in1 + c); + + uint32_t rem = k_valid - c; + HVX_VectorPred mask = Q6_Q_vsetq2_R(rem > 0 ? rem * sizeof(float) : 0); + v0 = Q6_V_vmux_QVV(mask, v0, Q6_V_vzero()); + v1 = Q6_V_vmux_QVV(mask, v1, Q6_V_vzero()); + + HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); + + uint32_t c0 = c / HTP_MM_HMX_TILE_N_COLS; // tile column index + uint32_t tile_idx = r0 * (k_block / HTP_MM_HMX_TILE_N_COLS) + c0; + + HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HTP_MM_HMX_TILE_N_ELMS); + tile[r1 / 2] = v_out; + } + } +} + +typedef struct { + __fp16 *dst; + const float *src; + uint32_t n_tasks; + uint32_t n_tot_chunks; + uint32_t n_chunks_per_task; + uint32_t k_block; + uint32_t k_stride; + uint32_t k_valid; + struct htp_thread_trace * traces; + struct htp_context * ctx; + float * vtcm_f32_act; +} activation_transfer_task_state_t; + +static void transfer_activation_chunk_fp32_to_fp16_dma_pipelined( + dma_queue *dma_q, + __fp16 *restrict vtcm_dst, + const float *restrict src, + uint32_t n_rows, + uint32_t k_block, + uint32_t k_stride, + uint32_t k_valid, + float *thread_f32_act) { + + const uint32_t R = HTP_MM_DMA_ACT_ROWS_PER_STEP; + const uint32_t n_rows_padded = hex_align_up(n_rows, HTP_MM_HMX_TILE_N_ROWS); + + const uint32_t n_steps = n_rows_padded / R; + + // pre-fetch step 0 + if (n_steps > 0 && n_rows > 0) { + uint32_t nrows_to_fetch = hex_smin(n_rows, R); + dma_queue_push(dma_q, dma_make_ptr(thread_f32_act, src), + k_block * sizeof(float), k_stride * sizeof(float), k_valid * sizeof(float), nrows_to_fetch); + } + + for (uint32_t s = 0; s < n_steps; ++s) { + uint32_t r = R * s; + float *curr_buf = thread_f32_act + (s % 2) * R * k_block; + + if (r < n_rows) { + dma_queue_pop(dma_q); + } + + uint32_t next_s = s + 1; + uint32_t next_r = R * next_s; + if (next_r < n_rows) { + uint32_t nrows_to_fetch = hex_smin(n_rows - next_r, R); + const float *next_src = src + next_r * k_stride; + float *next_buf = thread_f32_act + (next_s % 2) * R * k_block; + dma_queue_push(dma_q, dma_make_ptr(next_buf, next_src), + k_block * sizeof(float), k_stride * sizeof(float), k_valid * sizeof(float), nrows_to_fetch); + } + + #pragma unroll + for (uint32_t i = 0; i < HTP_MM_DMA_ACT_ROWS_PER_STEP; i += 2) { + uint32_t curr_r = r + i; + const bool row0_valid = (curr_r < n_rows); + const bool row1_valid = (curr_r + 1) < n_rows; + + const float *ptr_in0 = curr_buf + i * k_block; + const float *ptr_in1 = curr_buf + (i + 1) * k_block; + + uint32_t c = 0; + for (; c + 32 <= k_valid; c += 32) { + HVX_Vector v0 = Q6_V_vzero(); + HVX_Vector v1 = Q6_V_vzero(); + if (row0_valid) v0 = *(const HVX_Vector *)(ptr_in0 + c); + if (row1_valid) v1 = *(const HVX_Vector *)(ptr_in1 + c); + + HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); + + uint32_t r0 = curr_r / HTP_MM_HMX_TILE_N_ROWS; // tile row index + uint32_t r1 = curr_r % HTP_MM_HMX_TILE_N_ROWS; // intra-tile row idx + uint32_t c0 = c / HTP_MM_HMX_TILE_N_COLS; // tile column index + uint32_t tile_idx = r0 * (k_block / HTP_MM_HMX_TILE_N_COLS) + c0; + + HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HTP_MM_HMX_TILE_N_ELMS); + tile[r1 / 2] = v_out; + } + if (c < k_block) { + HVX_Vector v0 = Q6_V_vzero(); + HVX_Vector v1 = Q6_V_vzero(); + if (row0_valid) v0 = *(const HVX_Vector *)(ptr_in0 + c); + if (row1_valid) v1 = *(const HVX_Vector *)(ptr_in1 + c); + + uint32_t rem = k_valid - c; + HVX_VectorPred mask = Q6_Q_vsetq2_R(rem > 0 ? rem * sizeof(float) : 0); + v0 = Q6_V_vmux_QVV(mask, v0, Q6_V_vzero()); + v1 = Q6_V_vmux_QVV(mask, v1, Q6_V_vzero()); + + HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); + + uint32_t r0 = curr_r / HTP_MM_HMX_TILE_N_ROWS; // tile row index + uint32_t r1 = curr_r % HTP_MM_HMX_TILE_N_ROWS; // intra-tile row idx + uint32_t c0 = c / HTP_MM_HMX_TILE_N_COLS; // tile column index + uint32_t tile_idx = r0 * (k_block / HTP_MM_HMX_TILE_N_COLS) + c0; + + HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HTP_MM_HMX_TILE_N_ELMS); + tile[r1 / 2] = v_out; + } + } + } +} + +typedef struct { + const struct mmid_row_mapping *matrix_rows; + __fp16 *dst; + const float *src; + uint32_t n_tasks; + uint32_t n_tot_chunks; + uint32_t n_chunks_per_task; + uint32_t k_block; + uint32_t cur_a; + uint32_t mapping_stride; + uint32_t ne11; + struct fastdiv_values ne11_div; + size_t nb11; + size_t nb12; + uint32_t start_row; + uint32_t cne1; + uint32_t k_valid; + struct htp_thread_trace *traces; +} activation_transfer_gathered_task_state_t; + +typedef struct { + const struct mmid_row_mapping *matrix_rows; + const __fp16 *vtcm_src; + float *dst; + uint32_t n_tasks; + uint32_t n_tot_chunks; + uint32_t n_chunks_per_task; + uint32_t n_cols; + uint32_t cur_a; + uint32_t mapping_stride; + size_t dst_nb1; + size_t dst_nb2; + uint32_t start_row; + uint32_t cne1; + struct htp_thread_trace *traces; +} output_transfer_scattered_task_state_t; + +static void transfer_activation_chunk_fp32_to_fp16_gathered( + __fp16 *restrict vtcm_dst, + const float *restrict src, + uint32_t start_row, + uint32_t n_rows, + uint32_t k_block, + const struct mmid_row_mapping *matrix_rows, + uint32_t cur_a, + uint32_t mapping_stride, + uint32_t ne11, + const struct fastdiv_values * ne11_div, + size_t nb11, + size_t nb12, + uint32_t cne1, + uint32_t k_valid) { + const uint32_t n_rows_padded = hex_align_up(n_rows, HTP_MM_HMX_TILE_N_ROWS); + const uint32_t n_rows_tiled = (n_rows / HTP_MM_HMX_TILE_N_ROWS) * HTP_MM_HMX_TILE_N_ROWS; + + uint32_t r = 0; + + #pragma unroll(2) + for (r = 0; r < n_rows_tiled; r += 2) { + uint32_t r_idx0 = start_row + r + 0; + uint32_t r_idx1 = start_row + r + 1; + uint32_t r0 = r_idx0 / HTP_MM_HMX_TILE_N_ROWS; // tile row index + uint32_t r1 = r_idx0 % HTP_MM_HMX_TILE_N_ROWS; // intra-tile row idx + + struct mmid_row_mapping mapping0 = matrix_rows[cur_a * mapping_stride + r_idx0]; + struct mmid_row_mapping mapping1 = matrix_rows[cur_a * mapping_stride + r_idx1]; + + uint32_t i11_0 = fastmodulo(mapping0.i1, ne11, ne11_div); + uint32_t i11_1 = fastmodulo(mapping1.i1, ne11, ne11_div); + + const float *row0_ptr = (const float *) ((const uint8_t *) src + i11_0 * nb11 + mapping0.i2 * nb12); + const float *row1_ptr = (const float *) ((const uint8_t *) src + i11_1 * nb11 + mapping1.i2 * nb12); + + uint32_t c = 0; + for (; c + 32 <= k_valid; c += 32) { + HVX_Vector v0 = *(const HVX_Vector *)(row0_ptr + c); + HVX_Vector v1 = *(const HVX_Vector *)(row1_ptr + c); + HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); + + uint32_t c0 = c / HTP_MM_HMX_TILE_N_COLS; + uint32_t tile_idx = r0 * (k_block / HTP_MM_HMX_TILE_N_COLS) + c0; + + HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HTP_MM_HMX_TILE_N_ELMS); + tile[r1 / 2] = v_out; + } + if (c < k_block) { + HVX_Vector v0 = *(const HVX_Vector *)(row0_ptr + c); + HVX_Vector v1 = *(const HVX_Vector *)(row1_ptr + c); + + uint32_t rem = k_valid - c; + HVX_VectorPred mask = Q6_Q_vsetq2_R(rem > 0 ? rem * sizeof(float) : 0); + v0 = Q6_V_vmux_QVV(mask, v0, Q6_V_vzero()); + v1 = Q6_V_vmux_QVV(mask, v1, Q6_V_vzero()); + + HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); + + uint32_t c0 = c / HTP_MM_HMX_TILE_N_COLS; + uint32_t tile_idx = r0 * (k_block / HTP_MM_HMX_TILE_N_COLS) + c0; + + HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HTP_MM_HMX_TILE_N_ELMS); + tile[r1 / 2] = v_out; + } + } + + for (; r < n_rows_padded; r += 2) { + uint32_t r_idx0 = start_row + r; + uint32_t r0 = r_idx0 / HTP_MM_HMX_TILE_N_ROWS; // tile row index + uint32_t r1 = r_idx0 % HTP_MM_HMX_TILE_N_ROWS; // intra-tile row idx + + const bool row0_valid = (start_row + r + 0) < cne1; + const bool row1_valid = (start_row + r + 1) < cne1; + + const float *row0_ptr = NULL; + const float *row1_ptr = NULL; + + if (row0_valid) { + struct mmid_row_mapping mapping0 = matrix_rows[cur_a * mapping_stride + (start_row + r + 0)]; + uint32_t i11_0 = fastmodulo(mapping0.i1, ne11, ne11_div); + row0_ptr = (const float *) ((const uint8_t *) src + i11_0 * nb11 + mapping0.i2 * nb12); + } + if (row1_valid) { + struct mmid_row_mapping mapping1 = matrix_rows[cur_a * mapping_stride + (start_row + r + 1)]; + uint32_t i11_1 = fastmodulo(mapping1.i1, ne11, ne11_div); + row1_ptr = (const float *) ((const uint8_t *) src + i11_1 * nb11 + mapping1.i2 * nb12); + } + + uint32_t c = 0; + for (; c + 32 <= k_valid; c += 32) { + HVX_Vector v0 = Q6_V_vzero(); + HVX_Vector v1 = Q6_V_vzero(); + if (row0_valid) v0 = *(const HVX_Vector *)(row0_ptr + c); + if (row1_valid) v1 = *(const HVX_Vector *)(row1_ptr + c); + + HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); + + uint32_t c0 = c / HTP_MM_HMX_TILE_N_COLS; + uint32_t tile_idx = r0 * (k_block / HTP_MM_HMX_TILE_N_COLS) + c0; + + HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HTP_MM_HMX_TILE_N_ELMS); + tile[r1 / 2] = v_out; + } + if (c < k_block) { + HVX_Vector v0 = Q6_V_vzero(); + HVX_Vector v1 = Q6_V_vzero(); + if (row0_valid) v0 = *(const HVX_Vector *)(row0_ptr + c); + if (row1_valid) v1 = *(const HVX_Vector *)(row1_ptr + c); + + uint32_t rem = k_valid - c; + HVX_VectorPred mask = Q6_Q_vsetq2_R(rem > 0 ? rem * sizeof(float) : 0); + v0 = Q6_V_vmux_QVV(mask, v0, Q6_V_vzero()); + v1 = Q6_V_vmux_QVV(mask, v1, Q6_V_vzero()); + + HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); + + uint32_t c0 = c / HTP_MM_HMX_TILE_N_COLS; + uint32_t tile_idx = r0 * (k_block / HTP_MM_HMX_TILE_N_COLS) + c0; + + HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HTP_MM_HMX_TILE_N_ELMS); + tile[r1 / 2] = v_out; + } + } +} + +static void transfer_activation_chunk_fp32_to_fp16_gathered_flat( + __fp16 *restrict vtcm_dst, + const float *restrict src, + uint32_t start_row, + uint32_t n_rows, + uint32_t k_block, + const struct mmid_row_mapping *matrix_rows, + uint32_t cur_a, + uint32_t mapping_stride, + size_t nb12, + uint32_t cne1, + uint32_t k_valid) { + const uint32_t n_rows_padded = hex_align_up(n_rows, HTP_MM_HMX_TILE_N_ROWS); + const uint32_t n_rows_tiled = (n_rows / HTP_MM_HMX_TILE_N_ROWS) * HTP_MM_HMX_TILE_N_ROWS; + + uint32_t r = 0; + + #pragma unroll(2) + for (r = 0; r < n_rows_tiled; r += 2) { + uint32_t r_idx0 = start_row + r + 0; + uint32_t r_idx1 = start_row + r + 1; + uint32_t r0 = r_idx0 / HTP_MM_HMX_TILE_N_ROWS; // tile row index + uint32_t r1 = r_idx0 % HTP_MM_HMX_TILE_N_ROWS; // intra-tile row idx + + struct mmid_row_mapping mapping0 = matrix_rows[cur_a * mapping_stride + r_idx0]; + struct mmid_row_mapping mapping1 = matrix_rows[cur_a * mapping_stride + r_idx1]; + + const float *row0_ptr = (const float *) ((const uint8_t *) src + mapping0.i2 * nb12); + const float *row1_ptr = (const float *) ((const uint8_t *) src + mapping1.i2 * nb12); + + uint32_t c = 0; + for (; c + 32 <= k_valid; c += 32) { + HVX_Vector v0 = *(const HVX_Vector *)(row0_ptr + c); + HVX_Vector v1 = *(const HVX_Vector *)(row1_ptr + c); + HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); + + uint32_t c0 = c / HTP_MM_HMX_TILE_N_COLS; + uint32_t tile_idx = r0 * (k_block / HTP_MM_HMX_TILE_N_COLS) + c0; + + HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HTP_MM_HMX_TILE_N_ELMS); + tile[r1 / 2] = v_out; + } + if (c < k_block) { + HVX_Vector v0 = *(const HVX_Vector *)(row0_ptr + c); + HVX_Vector v1 = *(const HVX_Vector *)(row1_ptr + c); + + uint32_t rem = k_valid - c; + HVX_VectorPred mask = Q6_Q_vsetq2_R(rem > 0 ? rem * sizeof(float) : 0); + v0 = Q6_V_vmux_QVV(mask, v0, Q6_V_vzero()); + v1 = Q6_V_vmux_QVV(mask, v1, Q6_V_vzero()); + + HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); + + uint32_t c0 = c / HTP_MM_HMX_TILE_N_COLS; + uint32_t tile_idx = r0 * (k_block / HTP_MM_HMX_TILE_N_COLS) + c0; + + HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HTP_MM_HMX_TILE_N_ELMS); + tile[r1 / 2] = v_out; + } + } + + for (; r < n_rows_padded; r += 2) { + uint32_t r_idx0 = start_row + r; + uint32_t r0 = r_idx0 / HTP_MM_HMX_TILE_N_ROWS; // tile row index + uint32_t r1 = r_idx0 % HTP_MM_HMX_TILE_N_ROWS; // intra-tile row idx + + const bool row0_valid = (start_row + r + 0) < cne1; + const bool row1_valid = (start_row + r + 1) < cne1; + + const float *row0_ptr = NULL; + const float *row1_ptr = NULL; + + if (row0_valid) { + struct mmid_row_mapping mapping0 = matrix_rows[cur_a * mapping_stride + (start_row + r + 0)]; + row0_ptr = (const float *) ((const uint8_t *) src + mapping0.i2 * nb12); + } + if (row1_valid) { + struct mmid_row_mapping mapping1 = matrix_rows[cur_a * mapping_stride + (start_row + r + 1)]; + row1_ptr = (const float *) ((const uint8_t *) src + mapping1.i2 * nb12); + } + + uint32_t c = 0; + for (; c + 32 <= k_valid; c += 32) { + HVX_Vector v0 = Q6_V_vzero(); + HVX_Vector v1 = Q6_V_vzero(); + if (row0_valid) v0 = *(const HVX_Vector *)(row0_ptr + c); + if (row1_valid) v1 = *(const HVX_Vector *)(row1_ptr + c); + + HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); + + uint32_t c0 = c / HTP_MM_HMX_TILE_N_COLS; + uint32_t tile_idx = r0 * (k_block / HTP_MM_HMX_TILE_N_COLS) + c0; + + HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HTP_MM_HMX_TILE_N_ELMS); + tile[r1 / 2] = v_out; + } + if (c < k_block) { + HVX_Vector v0 = Q6_V_vzero(); + HVX_Vector v1 = Q6_V_vzero(); + if (row0_valid) v0 = *(const HVX_Vector *)(row0_ptr + c); + if (row1_valid) v1 = *(const HVX_Vector *)(row1_ptr + c); + + uint32_t rem = k_valid - c; + HVX_VectorPred mask = Q6_Q_vsetq2_R(rem > 0 ? rem * sizeof(float) : 0); + v0 = Q6_V_vmux_QVV(mask, v0, Q6_V_vzero()); + v1 = Q6_V_vmux_QVV(mask, v1, Q6_V_vzero()); + + HVX_Vector v_out = hvx_vec_f32_to_f16_shuff(v0, v1); + + uint32_t c0 = c / HTP_MM_HMX_TILE_N_COLS; + uint32_t tile_idx = r0 * (k_block / HTP_MM_HMX_TILE_N_COLS) + c0; + + HVX_Vector *tile = (HVX_Vector *) (vtcm_dst + tile_idx * HTP_MM_HMX_TILE_N_ELMS); + tile[r1 / 2] = v_out; + } + } +} + +static void transfer_output_chunk_fp16_to_fp32_scattered( + float *restrict dst, + const __fp16 *restrict vtcm_src, + uint32_t start_row, + uint32_t n_rows, + uint32_t n_cols, + const struct mmid_row_mapping *matrix_rows, + uint32_t cur_a, + uint32_t mapping_stride, + size_t dst_nb1, + size_t dst_nb2, + uint32_t cne1) { + assert(n_cols % HTP_MM_HMX_TILE_N_COLS == 0); + const size_t tile_row_stride = (n_cols / HTP_MM_HMX_TILE_N_COLS) * HTP_MM_HMX_TILE_N_ELMS; + + const HVX_Vector one = hvx_vec_splat_f16(1.0); + + for (size_t r = 0; r < n_rows; r += 2) { + uint32_t r_idx0 = start_row + r + 0; + uint32_t r_idx1 = start_row + r + 1; + const size_t r0 = r_idx0 / HTP_MM_HMX_TILE_N_ROWS; + const size_t r1 = (r_idx0 % HTP_MM_HMX_TILE_N_ROWS) / 2; // index of the row pair within the tile + const __fp16 *row_base = vtcm_src + r0 * tile_row_stride; + + if (r_idx0 >= cne1) break; + + struct mmid_row_mapping mapping0 = matrix_rows[cur_a * mapping_stride + r_idx0]; + float *output_row0 = (float *) ((uint8_t *) dst + mapping0.i1 * dst_nb1 + mapping0.i2 * dst_nb2); + + float *output_row1 = NULL; + if (r_idx1 < cne1) { + struct mmid_row_mapping mapping1 = matrix_rows[cur_a * mapping_stride + r_idx1]; + output_row1 = (float *) ((uint8_t *) dst + mapping1.i1 * dst_nb1 + mapping1.i2 * dst_nb2); + } + + #pragma unroll(4) + for (size_t c = 0; c < (size_t)n_cols; c += HTP_MM_HMX_TILE_N_COLS) { + const size_t c0 = c / HTP_MM_HMX_TILE_N_COLS; + const __fp16 *tile = row_base + c0 * HTP_MM_HMX_TILE_N_ELMS; + HVX_Vector v = ((const HVX_Vector *) tile)[r1]; + HVX_VectorPair vp = Q6_Wqf32_vmpy_VhfVhf(v, one); + + HVX_Vector *pv_out0 = (HVX_Vector *) (output_row0 + c); + HVX_Vector *pv_out1 = output_row1 ? (HVX_Vector *) (output_row1 + c) : NULL; + + *pv_out0 = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(vp)); + if (pv_out1) { + *pv_out1 = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(vp)); + } + } + } +} diff --git a/ggml/src/ggml-hexagon/htp/hmx-ops.c b/ggml/src/ggml-hexagon/htp/hmx-ops.c deleted file mode 100644 index 114d8c14811c..000000000000 --- a/ggml/src/ggml-hexagon/htp/hmx-ops.c +++ /dev/null @@ -1,6 +0,0 @@ -// HMX operations compiled as a single translation unit. -// This allows interprocedural optimizations within HMX ops without requiring global HTP LTO. - -#include "hmx-queue.c" -#include "hmx-matmul-ops.c" -#include "hmx-flash-attn-ops.c" diff --git a/ggml/src/ggml-hexagon/htp/hmx-ops.h b/ggml/src/ggml-hexagon/htp/hmx-ops.h deleted file mode 100644 index a67842f3ffcf..000000000000 --- a/ggml/src/ggml-hexagon/htp/hmx-ops.h +++ /dev/null @@ -1,88 +0,0 @@ -// HMX operation entry-point declarations. -// Ported from htp-ops-lib/include/dsp/ops.h (renamed, benchmark kernels removed). (https://github.com/haozixu/htp-ops-lib) - -#ifndef HMX_OPS_H -#define HMX_OPS_H - -#include -#include - -#include "htp-ops.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef struct { - float *dst; - const float *activation; - const __fp16 *permuted_weight; - int m; - int k; - int n; - int act_stride; - int weight_stride; - int dst_stride; - int ne02; - int ne03; - int ne12; - int ne13; - size_t src0_nb2; - size_t src0_nb3; - size_t src1_nb2; - size_t src1_nb3; - size_t dst_nb2; - size_t dst_nb3; -} hmx_matmul_f16_f32_batched_params_t; - -// HMX matrix multiplication — tile-permuted FP16 weights, FP32 activation/output -// act_stride: activation row stride in elements (= k for contiguous, or -// nb[1]/sizeof(float) for permuted tensors like attention Q). -// weight_stride: weight row stride in elements (= k for compact weights, or -// nb[1]/sizeof(__fp16) for permuted KV-cache views used by QK). -int hmx_matmul_f16_f32(struct htp_context *ctx, - float *restrict dst, - const float *activation, - const __fp16 *permuted_weight, - int m, int k, int n, - int act_stride, - int weight_stride); - -// Batched F16 wrapper over hmx_mat_mul_f16_f32. -// Batch semantics match ggml_mul_mat(): src0 broadcasts to src1 in dims 2/3. -int hmx_matmul_f16_f32_batched(struct htp_context *ctx, const hmx_matmul_f16_f32_batched_params_t *params); - -// HMX matrix multiplication — all supported weight types (F16/F32/Q4_0/Q4_1/Q8_0/IQ4_NL/MXFP4) -int hmx_matmul_2d_f32(struct htp_context *ctx, - float *restrict dst, - const float *activation, - const uint8_t *permuted_weight, - int m, int k, int n, - int act_stride, - int weight_stride, - int weight_type); - -struct mmid_row_mapping; - -int hmx_matmul_id_2d_f32(struct htp_context *ctx, - float *restrict dst, - const float *activation, - const uint8_t *permuted_weight, - int m, int k, int n, - int ne11, - size_t act_nb1, size_t act_nb2, - size_t dst_nb1, size_t dst_nb2, - int weight_stride, - int weight_type, - const struct mmid_row_mapping *matrix_rows, - int cur_a, - int mapping_stride); - -// HMX flash attention -int hmx_flash_attn_ext(struct htp_ops_context * octx); - -#ifdef __cplusplus -} -#endif - -#endif // HMX_OPS_H diff --git a/ggml/src/ggml-hexagon/htp/hmx-profile.h b/ggml/src/ggml-hexagon/htp/hmx-profile.h deleted file mode 100644 index 01eece720c57..000000000000 --- a/ggml/src/ggml-hexagon/htp/hmx-profile.h +++ /dev/null @@ -1,34 +0,0 @@ -// Conditional fine-grained profiling macros for HMX operations. -// -// Define ENABLE_PROFILE_TIMERS (via compiler flag or before including this -// header) to instrument sub-operation latencies with HAP qtimer. When the -// macro is not defined the TIMER_* helpers expand to nothing so there is zero -// overhead. -// -// Usage: -// TIMER_DEFINE(my_phase); // declare accumulator variable -// TIMER_START(my_phase); // snapshot start time -// ... work ... -// TIMER_STOP(my_phase); // accumulate elapsed ticks -// FARF(ALWAYS, "my_phase: %lld us", TIMER_US(my_phase)); - -#ifndef HMX_PROFILE_H -#define HMX_PROFILE_H - -#include - -// #define ENABLE_PROFILE_TIMERS - -#if defined(ENABLE_PROFILE_TIMERS) -# define TIMER_DEFINE(name) int64_t name##_ticks = 0 -# define TIMER_START(name) int64_t name##_t0 = HAP_perf_get_qtimer_count() -# define TIMER_STOP(name) name##_ticks += HAP_perf_get_qtimer_count() - name##_t0 -# define TIMER_US(name) HAP_perf_qtimer_count_to_us(name##_ticks) -#else -# define TIMER_DEFINE(name) -# define TIMER_START(name) -# define TIMER_STOP(name) -# define TIMER_US(name) 0LL -#endif - -#endif // HMX_PROFILE_H diff --git a/ggml/src/ggml-hexagon/htp/hmx-queue.c b/ggml/src/ggml-hexagon/htp/hmx-queue.c index 5b1d83a0cbf0..a0007539c561 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-queue.c +++ b/ggml/src/ggml-hexagon/htp/hmx-queue.c @@ -44,7 +44,9 @@ static inline void hmx_queue_process(struct hmx_queue *q, bool* killed) { case HMX_QUEUE_SUSPEND: hmx_unlock(q); break; default: hmx_lock(q); + htp_trace_event_start(q->trace, HTP_TRACE_EVT_HMX_COMP, ir); d->func(d->data); + htp_trace_event_stop(q->trace, HTP_TRACE_EVT_HMX_COMP, ir); break; } diff --git a/ggml/src/ggml-hexagon/htp/hmx-queue.h b/ggml/src/ggml-hexagon/htp/hmx-queue.h index 0d48c280f526..83135cd91d8b 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-queue.h +++ b/ggml/src/ggml-hexagon/htp/hmx-queue.h @@ -11,6 +11,7 @@ #include #include "hex-utils.h" +#include "hex-profile.h" #ifdef __cplusplus extern "C" { @@ -47,6 +48,7 @@ struct hmx_queue { void * stack; uint32_t hap_rctx; bool hmx_locked; + struct htp_thread_trace * trace; }; struct hmx_queue * hmx_queue_create(size_t capacity, uint32_t hap_rctx); diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index 0f1676f077a6..6ad77d3daa32 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -4,6 +4,7 @@ #include "hex-dma.h" #include "hmx-queue.h" #include "htp-ops.h" +#include "hex-profile.h" #include "worker-pool.h" #include @@ -12,7 +13,9 @@ #include #include +#ifndef HTP_MAX_NTHREADS #define HTP_MAX_NTHREADS 10 +#endif #define HTP_MAX_MMAPS 16 // Memory mapping @@ -41,9 +44,13 @@ struct htp_ops_context { enum htp_op_code op; // FIXME: rename to opcode int32_t op_params[HTP_OP_MAX_PARAMS]; + int32_t kernel_params[HTP_OP_MAX_KERN_PARAMS]; const struct htp_tensor * src[HTP_OP_MAX_INPUTS]; - const struct htp_tensor * dst; + union { + const struct htp_tensor * dst; + const struct htp_tensor * dsts[HTP_OP_MAX_OUTPUTS]; + }; // TODO convert these to an array struct htp_spad src0_spad; @@ -70,6 +77,7 @@ struct htp_context { bool hmx_enabled; bool etm; uint32_t profiler; + struct htp_thread_trace trace[HTP_MAX_NTHREADS + 1]; uint8_t * vtcm_base; size_t vtcm_size; @@ -85,13 +93,13 @@ struct htp_context { struct htp_ops_context octx; -#ifdef HTP_HAS_HMX struct hmx_queue * hmx_queue; // Async HMX queue for pipeline overlap -#endif }; int op_matmul(struct htp_ops_context * octx); int op_matmul_id(struct htp_ops_context * octx); +int op_matmul_qkv(struct htp_ops_context * octx); +int op_matmul_ffn(struct htp_ops_context * octx); int op_binary(struct htp_ops_context * octx); int op_unary(struct htp_ops_context * octx); int op_sum_rows(struct htp_ops_context * octx); diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index fa85bf4ca0c4..d04090135787 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -28,18 +28,19 @@ enum htp_data_type { HTP_TYPE_MXFP4 = 39, // types used internally for repack, dyn.quant, etc - HTP_TYPE_Q4_0x4x2 = 200, - HTP_TYPE_Q4_1x4x2, - HTP_TYPE_Q8_0x4x2, - HTP_TYPE_MXFP4x4x2, + HTP_TYPE_Q4_0_TILED = 200, + HTP_TYPE_Q4_1_TILED, + HTP_TYPE_Q8_0_TILED, + HTP_TYPE_MXFP4_TILED, HTP_TYPE_INVALID }; // Constats for internal types -#define QK_Q4_0x4x2 256 // 4x Q4_0 blocks packed with next 4x Q4_0 blocks (size in bytes 128) -#define QK_Q8_0x4x2 256 // 4x Q8_0 blocks concat with next 4x Q8_0 blocks -#define QK_MXFP4x4x2 256 // 4x MXFP4 blocks concat with next 4x MXFP4 blocks +#define QK_Q4_0_TILED 256 // 32x32 Q4_0 tiled layout +#define QK_Q8_0_TILED 128 // 32x32 Q8_0 tiled layout +#define QK_MXFP4_TILED 256 // 32x32 MXFP4 tiled layout + // Mask to enable various stages of the Ops. @@ -57,6 +58,8 @@ enum htp_op_code { HTP_OP_DIV = 3, HTP_OP_MUL_MAT, HTP_OP_MUL_MAT_ID, + HTP_OP_MUL_MAT_QKV, + HTP_OP_MUL_MAT_FFN, HTP_OP_RMS_NORM, HTP_OP_RMS_NORM_MUL, HTP_OP_UNARY_SILU, @@ -99,7 +102,9 @@ enum htp_op_code { #define HTP_OP_MAX_DIMS 4 // aka GGML_MAX_DIMS #define HTP_OP_MAX_INPUTS 6 // aka GGML_MAX_SRCS +#define HTP_OP_MAX_OUTPUTS 4 #define HTP_OP_MAX_PARAMS 16 // aka GGML_MAX_OP_PARAMS +#define HTP_OP_MAX_KERN_PARAMS 32 #define HTP_OP_MAX_BUFS 16 #define HTP_OP_MAX_REQS 256 @@ -142,14 +147,42 @@ struct htp_op_desc { uint32_t opcode; // GGML/HTP Op uint32_t flags; // Op flags int32_t params[HTP_OP_MAX_PARAMS]; // Params for the op, e.g. epsilon of RMS norm + int32_t kernel_params[HTP_OP_MAX_KERN_PARAMS]; // generic blob for host-precomputed parameters uint16_t src[HTP_OP_MAX_INPUTS]; // Input tensors indices - uint16_t dst; // Output tensor index + uint16_t dst[HTP_OP_MAX_OUTPUTS]; // Output tensor indices + uint16_t pad[2]; // padding to align to 64 bits }; +#ifndef HTP_MAX_NTHREADS +#define HTP_MAX_NTHREADS 10 +#endif + +#define HTP_TRACE_MAX_EVENTS 256 + enum htp_profiler_mode { HTP_PROF_DISABLED = 0, HTP_PROF_BASIC = 1, HTP_PROF_PMU = 2, + HTP_PROF_TRACE = 3, +}; + +enum htp_trace_event_id { + HTP_TRACE_EVT_DMA = 0, + + HTP_TRACE_EVT_HVX_COMP = 20, + HTP_TRACE_EVT_HVX_A_QUANT = 21, + HTP_TRACE_EVT_HVX_A_PREP = 22, + HTP_TRACE_EVT_HVX_W_DEQUANT = 23, + HTP_TRACE_EVT_HVX_W_PREP = 24, + HTP_TRACE_EVT_HVX_O_PROC = 25, + + HTP_TRACE_EVT_HMX_COMP = 40, +}; + +struct htp_trace_desc { + uint32_t cycles; // lower 32-bits of cycle counter + uint16_t id; // Event ID + uint16_t info; // bit 15: is_stop. bits 14-0: tile/chunk index or other metadata. }; #define HTP_PROF_PMU_NCNT 8 @@ -158,8 +191,8 @@ enum htp_profiler_mode { struct htp_prof_desc { uint32_t opcode; // GGML/HTP Op uint32_t usecs; // Number of usec - uint32_t cycles; // Number of cycles - uint32_t pad; // Unused + uint32_t cycles_start; // Start cycle counter + uint32_t cycles_stop; // Stop cycle counter uint32_t pmu[HTP_PROF_PMU_NCNT]; // PMU counters }; @@ -168,7 +201,7 @@ struct htp_opbatch_req { uint32_t n_bufs; // Number of buffers uint32_t n_tensors; // Number of tensors uint32_t n_ops; // Number of ops - uint32_t flags; // unused + uint32_t n_traces; // Number of trace descriptors per thread uint32_t pad; // unused // struct htp_buf_desc bufs[]; -- dspqueue buf 0 // struct htp_tensor tensors[]; -- dspqueue buf 0 @@ -181,7 +214,8 @@ struct htp_opbatch_rsp { uint32_t n_bufs; // Number of buffers uint32_t n_tensors; // Number of tensors uint32_t n_ops; // Number of op profile descriptors - uint32_t pad; // unused + uint32_t n_traces[HTP_MAX_NTHREADS + 1]; + uint8_t pad[8]; // align to 8 bytes // struct htp_prof_desc profs[]; -- dspqueue buf 0 }; diff --git a/ggml/src/ggml-hexagon/htp/htp_iface.idl b/ggml/src/ggml-hexagon/htp/htp_iface.idl index d696a5fba0c1..47693d8b8b24 100644 --- a/ggml/src/ggml-hexagon/htp/htp_iface.idl +++ b/ggml/src/ggml-hexagon/htp/htp_iface.idl @@ -11,12 +11,13 @@ struct htp_iface_pmu_conf { }; interface htp_iface : remote_handle64 { - AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 use_hmx, in uint64 max_vmem); + AEEResult start(in uint32 sess_id, in uint64 dsp_queue_id, in uint32 n_hvx, in uint32 n_hmx, in uint64 max_vmem); AEEResult stop(); AEEResult mmap(in uint32 fd, in uint32 size); AEEResult munmap(in uint32 fd); AEEResult profiler(in uint32 mode, in htp_iface_pmu_conf pmu); AEEResult etm(in uint32 enable); + AEEResult hwinfo(rout uint32 n_threads, rout uint32 n_hvx, rout uint32 n_hmx, rout uint64 vtcm_size); }; #endif /* HTP_IDL */ diff --git a/ggml/src/ggml-hexagon/htp/hvx-base.h b/ggml/src/ggml-hexagon/htp/hvx-base.h index f6cb02951d0c..493b26c6e755 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-base.h +++ b/ggml/src/ggml-hexagon/htp/hvx-base.h @@ -170,25 +170,7 @@ static inline HVX_VectorPair hvx_vec_f16_to_f32(HVX_Vector v) { } #endif -/* Q6_Vsf_equals_Vw is only available on v73+.*/ -#if __HVX_ARCH__ < 73 -static inline HVX_Vector hvx_vec_i32_to_qf32(HVX_Vector const in) -{ - HVX_Vector const vzero = Q6_V_vzero(); - HVX_VectorPred is_zero = Q6_Q_vcmp_eq_VwVw(in, vzero); - HVX_Vector lshift = Q6_Vw_vnormamt_Vw(in); - HVX_Vector normalized = Q6_Vw_vasl_VwVw(in, lshift); - HVX_Vector vexp = Q6_Vw_vsub_VwVw(Q6_V_vsplat_R(0x7f + 30), lshift); - HVX_Vector mant = Q6_V_vand_VV(Q6_V_vsplat_R(0xFFFFFF00), normalized); - HVX_Vector ret = Q6_V_vmux_QVV(is_zero, vzero, Q6_Vw_vadd_VwVw(mant, vexp)); - return ret; -} -static inline HVX_Vector Q6_Vsf_equals_Vw(HVX_Vector const in) -{ - return Q6_Vsf_equals_Vqf32(hvx_vec_i32_to_qf32(in)); -} -#endif static inline HVX_Vector hvx_vec_i16_from_hf_rnd_sat(HVX_Vector vin) { // This looks complicated. @@ -305,4 +287,17 @@ static inline HVX_Vector hvx_vec_mul_f32_f32(HVX_Vector a, HVX_Vector b) { #endif // __HVX_ARCH__ < 79 +static inline HVX_Vector hvx_vec_load_act_tile(const uint8_t * y_q, uint32_t kt, HVX_Vector * v_act_all) { + if (kt % 4 == 0) { + *v_act_all = hvx_vmem(y_q + kt * 32); + return *v_act_all; + } else if (kt % 4 == 1) { + return Q6_V_vror_VR(*v_act_all, 32); + } else if (kt % 4 == 2) { + return Q6_V_vror_VR(*v_act_all, 64); + } else { + return Q6_V_vror_VR(*v_act_all, 96); + } +} + #endif /* HVX_BASE_H */ diff --git a/ggml/src/ggml-hexagon/htp/hvx-mm-kernels-flat.h b/ggml/src/ggml-hexagon/htp/hvx-mm-kernels-flat.h new file mode 100644 index 000000000000..52351b1039c5 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hvx-mm-kernels-flat.h @@ -0,0 +1,1024 @@ +// Dynamic quantizers that produce flat (non-tiled) activations + +static inline void quantize_block_f32_q8_0_flat( + float * restrict x, + uint8_t * restrict y_quants, + __fp16 * restrict y_scales, + uint32_t block_idx +) { + HVX_Vector * vx = (HVX_Vector *) x; + HVX_Vector zero = Q6_V_vzero(); + + HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0])); + HVX_Vector vmax1_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[1])); + HVX_Vector vmax2_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[2])); + HVX_Vector vmax3_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[3])); + + HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); + HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); + HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); + HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); + + HVX_Vector vmax0_qf = Q6_Vqf32_vsub_VsfVsf(vmax0_sf, zero); + HVX_Vector vmax1_qf = Q6_Vqf32_vsub_VsfVsf(vmax1_sf, zero); + HVX_Vector vmax2_qf = Q6_Vqf32_vsub_VsfVsf(vmax2_sf, zero); + HVX_Vector vmax3_qf = Q6_Vqf32_vsub_VsfVsf(vmax3_sf, zero); + + HVX_Vector vmax01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax1_qf, vmax0_qf))); + HVX_Vector vmax23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax3_qf, vmax2_qf))); + + HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf))); + HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); + + HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 + HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 + HVX_Vector vd01_hf = Q6_Vhf_equals_Vqf16(vd01_qf16); + HVX_Vector vd23_hf = Q6_Vhf_equals_Vqf16(vd23_qf16); + + HVX_Vector vd01_inv_hf = hvx_vec_inverse_f16(vd01_hf); + HVX_Vector vd23_inv_hf = hvx_vec_inverse_f16(vd23_hf); + vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf)); + vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf)); + + HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf); + HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf); + HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16); + + * (HVX_Vector *) (y_quants + block_idx * 128) = vx_i8; + + HVX_VectorPair vp1 = Q6_W_vshuff_VVR(vd23_hf, vd01_hf, -2); + HVX_VectorPair vp2 = Q6_W_vshuff_VVR(Q6_V_hi_W(vp1), Q6_V_lo_W(vp1), -2); + HVX_Vector v_scales = Q6_V_lo_W(vp2); + hvx_vec_store_u(y_scales + block_idx * 4, 8, v_scales); +} + +static inline void quantize_block_f32_q8_1_flat( + float * restrict x, + uint8_t * restrict y_quants, + __fp16 * restrict y_scales, + uint32_t block_idx +) { + HVX_Vector * vx = (HVX_Vector *) x; + HVX_Vector zero = Q6_V_vzero(); + + HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0])); + HVX_Vector vmax1_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[1])); + HVX_Vector vmax2_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[2])); + HVX_Vector vmax3_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[3])); + + HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); + HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); + HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); + HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); + + HVX_Vector vmax0_qf = Q6_Vqf32_vsub_VsfVsf(vmax0_sf, zero); + HVX_Vector vmax1_qf = Q6_Vqf32_vsub_VsfVsf(vmax1_sf, zero); + HVX_Vector vmax2_qf = Q6_Vqf32_vsub_VsfVsf(vmax2_sf, zero); + HVX_Vector vmax3_qf = Q6_Vqf32_vsub_VsfVsf(vmax3_sf, zero); + + HVX_Vector vmax01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax1_qf, vmax0_qf))); + HVX_Vector vmax23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax3_qf, vmax2_qf))); + + HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf))); + HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); + + HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 + HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 + HVX_Vector vd01_hf = Q6_Vhf_equals_Vqf16(vd01_qf16); + HVX_Vector vd23_hf = Q6_Vhf_equals_Vqf16(vd23_qf16); + + HVX_Vector vd01_inv_hf = hvx_vec_inverse_f16(vd01_hf); + HVX_Vector vd23_inv_hf = hvx_vec_inverse_f16(vd23_hf); + vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf)); + vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf)); + + HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf); + HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf); + HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16); + + const HVX_Vector ones = Q6_Vb_vsplat_R(1); + HVX_Vector v_sums = Q6_Vw_vrmpy_VbVb(vx_i8, ones); + v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 4)); + v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 8)); + v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 16)); + + * (HVX_Vector *) (y_quants + block_idx * 128) = vx_i8; + + HVX_VectorPair vp1 = Q6_W_vshuff_VVR(vd23_hf, vd01_hf, -2); + HVX_VectorPair vp2 = Q6_W_vshuff_VVR(Q6_V_hi_W(vp1), Q6_V_lo_W(vp1), -2); + HVX_Vector v_scales = Q6_V_lo_W(vp2); + + HVX_VectorPair v_deal1 = Q6_W_vdeal_VVR(v_sums, v_sums, -4); + HVX_Vector v_even1 = Q6_V_lo_W(v_deal1); + HVX_VectorPair v_deal2 = Q6_W_vdeal_VVR(v_even1, v_even1, -4); + HVX_Vector v_even2 = Q6_V_lo_W(v_deal2); + HVX_VectorPair v_deal3 = Q6_W_vdeal_VVR(v_even2, v_even2, -4); + HVX_Vector v_sums_shuffled = Q6_V_lo_W(v_deal3); + + HVX_Vector v_sums_sf = Q6_Vsf_equals_Vw(v_sums_shuffled); + HVX_Vector v_sums_hf = hvx_vec_f32_to_f16(v_sums_sf, Q6_V_vzero()); + + HVX_Vector v_prod = hvx_vec_mul_f16_f16(v_scales, v_sums_hf); + + HVX_VectorPair vp_scales = Q6_W_vshuff_VVR(v_prod, v_scales, -2); + HVX_Vector v_final = Q6_V_lo_W(vp_scales); + + hvx_vec_store_u(y_scales + block_idx * 8, 16, v_final); +} + +static inline void quantize_row_f32_q8_0_flat(float * restrict x, uint8_t * restrict y, uint32_t k) { + assert(k % 32 == 0); + const uint32_t quants_size = hex_round_up(k, 128); + uint8_t * restrict y_quants = y; + __fp16 * restrict y_scales = (__fp16 *) (y + quants_size); + + const uint32_t nb = (k + 127) / 128; + for (uint32_t i = 0; i < nb; i++) { + quantize_block_f32_q8_0_flat(x + i * 128, y_quants, y_scales, i); + } +} + +static inline void quantize_row_f32_q8_1_flat(float * restrict x, uint8_t * restrict y, uint32_t k) { + assert(k % 32 == 0); + const uint32_t quants_size = hex_round_up(k, 128); + uint8_t * restrict y_quants = y; + __fp16 * restrict y_scales = (__fp16 *) (y + quants_size); + + const uint32_t nb = (k + 127) / 128; + for (uint32_t i = 0; i < nb; i++) { + quantize_block_f32_q8_1_flat(x + i * 128, y_quants, y_scales, i); + } +} + +static inline void quantize_f32_q8_0_flat_kernel( + const uint8_t * restrict src_data, + uint8_t * restrict dst_data, + uint8_t * restrict tmp_data, + uint32_t ne0, + uint32_t nrows, + size_t src_row_size, + size_t dst_row_size +) { + const size_t src_row_size_padded = hex_round_up(src_row_size, QK_Q8_0_TILED * sizeof(float)); + hvx_splat_f32_a(tmp_data, 0.0f, src_row_size_padded / sizeof(float)); + + for (uint32_t i = 0; i < nrows; ++i) { + hex_l2fetch(src_data, src_row_size, src_row_size, 2); + hvx_copy_f32_aa(tmp_data, src_data, ne0); + + quantize_row_f32_q8_0_flat((float *) tmp_data, dst_data, ne0); + dst_data += dst_row_size; + src_data += src_row_size; + } +} + +static inline void quantize_f32_q8_1_flat_kernel( + const uint8_t * restrict src_data, + uint8_t * restrict dst_data, + uint8_t * restrict tmp_data, + uint32_t ne0, + uint32_t nrows, + size_t src_row_size, + size_t dst_row_size +) { + const size_t src_row_size_padded = hex_round_up(src_row_size, QK_Q8_0_TILED * sizeof(float)); + hvx_splat_f32_a(tmp_data, 0.0f, src_row_size_padded / sizeof(float)); + + for (uint32_t i = 0; i < nrows; ++i) { + hex_l2fetch(src_data, src_row_size, src_row_size, 2); + hvx_copy_f32_aa(tmp_data, src_data, ne0); + + quantize_row_f32_q8_1_flat((float *) tmp_data, dst_data, ne0); + dst_data += dst_row_size; + src_data += src_row_size; + } +} + +static inline void quantize_f32_f32_flat_kernel( + const uint8_t * restrict src_data, + uint8_t * restrict dst_data, + uint8_t * restrict tmp_data, + uint32_t ne0, + uint32_t nrows, + size_t src_stride, + size_t dst_stride +) { + (void) tmp_data; + const size_t src_row_size = ne0 * sizeof(float); + for (uint32_t i = 0; i < nrows; ++i) { + hex_l2fetch(src_data, src_row_size, src_stride, 2); + hvx_copy_f32_au(dst_data, src_data, ne0); + + dst_data += dst_stride; + src_data += src_stride; + } +} + +static inline void quantize_f32_f16_flat_kernel( + const uint8_t * restrict src_data, + uint8_t * restrict dst_data, + uint8_t * restrict tmp_data, + uint32_t ne0, + uint32_t nrows, + size_t src_stride, + size_t dst_stride +) { + (void) tmp_data; + const size_t src_row_size = ne0 * sizeof(float); + for (uint32_t i = 0; i < nrows; ++i) { + hex_l2fetch(src_data, src_row_size, src_stride, 2); + hvx_copy_f16_f32_au(dst_data, src_data, ne0); + + dst_data += dst_stride; + src_data += src_stride; + } +} + +static inline void quantize_f16_f16_flat_kernel( + const uint8_t * restrict src_data, + uint8_t * restrict dst_data, + uint8_t * restrict tmp_data, + uint32_t ne0, + uint32_t nrows, + size_t src_stride, + size_t dst_stride +) { + (void) tmp_data; + const size_t src_row_size = ne0 * sizeof(float); + for (uint32_t i = 0; i < nrows; ++i) { + hex_l2fetch(src_data, src_row_size, src_stride, 2); + hvx_copy_f16_au(dst_data, src_data, ne0); + + dst_data += dst_stride; + src_data += src_stride; + } +} + +// Dot kernels that consume flat (non-tiled) activations + +static void flat_vec_dot_q4_0_32x1(const uint32_t n, float * restrict s, const void * restrict vx, const void * restrict vy, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y_q = vy; + + HVX_Vector v_sum_float = Q6_V_vzero(); + HVX_Vector i8 = Q6_Vb_vsplat_R(8); + + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + HVX_Vector v_repl_ctrl = * (const HVX_Vector *) repl; + + const uint32_t quants_size = hex_round_up(n, 128); + const __fp16 * restrict y_scales = (const __fp16 *) (y_q + quants_size); + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + + uint32_t block_idx = kt / 4; + uint32_t sub_idx = kt % 4; + + HVX_Vector vx_i8 = * (const HVX_Vector *) (y_q + block_idx * 128); + HVX_Vector v_act_raw = Q6_V_vror_VR(vx_i8, sub_idx * 32); + + HVX_Vector v_act_rep[8]; + v_act_rep[0] = Q6_V_vdelta_VV(v_act_raw, v_repl_ctrl); + v_act_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 4), v_repl_ctrl); + v_act_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 8), v_repl_ctrl); + v_act_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 12), v_repl_ctrl); + v_act_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 16), v_repl_ctrl); + v_act_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 20), v_repl_ctrl); + v_act_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 24), v_repl_ctrl); + v_act_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 28), v_repl_ctrl); + + HVX_Vector v_sum = accum_4bit_32x1(vptr, v_act_rep, i8); + HVX_Vector v_sum_sf = Q6_Vsf_equals_Vw(v_sum); + + HVX_Vector v_scale_w = vptr[4]; + + __fp16 scale_a_val = y_scales[kt]; + HVX_Vector v_scale_a = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a_val)); + + HVX_Vector v_scale_comb = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a); + HVX_Vector v_sum_scaled = hvx_vec_mul_f32_f32(v_sum_sf, v_scale_comb); + + v_sum_float = hvx_vec_add_f32_f32(v_sum_float, v_sum_scaled); + } + + hvx_vec_store_u(s, valid_rows * sizeof(float), v_sum_float); +} + +static void flat_vec_dot_q4_0_32x2(const uint32_t n, float * restrict s0, float * restrict s1, const void * restrict vx, const void * restrict vy0, const void * restrict vy1, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y0_q = vy0; + const uint8_t * restrict y1_q = vy1; + + HVX_Vector v_sum_float_c0 = Q6_V_vzero(); + HVX_Vector v_sum_float_c1 = Q6_V_vzero(); + HVX_Vector i8 = Q6_Vb_vsplat_R(8); + + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + HVX_Vector v_repl_ctrl = * (const HVX_Vector *) repl; + + const uint32_t quants_size = hex_round_up(n, 128); + const __fp16 * restrict y0_scales = (const __fp16 *) (y0_q + quants_size); + const __fp16 * restrict y1_scales = (const __fp16 *) (y1_q + quants_size); + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + + uint32_t block_idx = kt / 4; + uint32_t sub_idx = kt % 4; + + HVX_Vector vx0_i8 = * (const HVX_Vector *) (y0_q + block_idx * 128); + HVX_Vector vx1_i8 = * (const HVX_Vector *) (y1_q + block_idx * 128); + + HVX_Vector v_act0_raw = Q6_V_vror_VR(vx0_i8, sub_idx * 32); + HVX_Vector v_act1_raw = Q6_V_vror_VR(vx1_i8, sub_idx * 32); + + HVX_Vector v_act0_rep[8]; + v_act0_rep[0] = Q6_V_vdelta_VV(v_act0_raw, v_repl_ctrl); + v_act0_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 4), v_repl_ctrl); + v_act0_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 8), v_repl_ctrl); + v_act0_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 12), v_repl_ctrl); + v_act0_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 16), v_repl_ctrl); + v_act0_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 20), v_repl_ctrl); + v_act0_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 24), v_repl_ctrl); + v_act0_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 28), v_repl_ctrl); + + HVX_Vector v_act1_rep[8]; + v_act1_rep[0] = Q6_V_vdelta_VV(v_act1_raw, v_repl_ctrl); + v_act1_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 4), v_repl_ctrl); + v_act1_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 8), v_repl_ctrl); + v_act1_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 12), v_repl_ctrl); + v_act1_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 16), v_repl_ctrl); + v_act1_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 20), v_repl_ctrl); + v_act1_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 24), v_repl_ctrl); + v_act1_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 28), v_repl_ctrl); + + HVX_VectorPair v_sums = accum_4bit_32x2(vptr, v_act0_rep, v_act1_rep, i8); + HVX_Vector v_sum_c0 = Q6_V_lo_W(v_sums); + HVX_Vector v_sum_c1 = Q6_V_hi_W(v_sums); + + HVX_Vector v_sum_sf_c0 = Q6_Vsf_equals_Vw(v_sum_c0); + HVX_Vector v_sum_sf_c1 = Q6_Vsf_equals_Vw(v_sum_c1); + + HVX_Vector v_scale_w = vptr[4]; + + __fp16 scale_a0_val = y0_scales[kt]; + __fp16 scale_a1_val = y1_scales[kt]; + HVX_Vector v_scale_a0 = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a0_val)); + HVX_Vector v_scale_a1 = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a1_val)); + + HVX_Vector v_scale_comb_c0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a0); + HVX_Vector v_scale_comb_c1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a1); + + HVX_Vector v_sum_scaled_c0 = hvx_vec_mul_f32_f32(v_sum_sf_c0, v_scale_comb_c0); + HVX_Vector v_sum_scaled_c1 = hvx_vec_mul_f32_f32(v_sum_sf_c1, v_scale_comb_c1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, v_sum_scaled_c0); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, v_sum_scaled_c1); + } + + hvx_vec_store_u(s0, valid_rows * sizeof(float), v_sum_float_c0); + hvx_vec_store_u(s1, valid_rows * sizeof(float), v_sum_float_c1); +} + +static void flat_vec_dot_q4_1_32x1(const uint32_t n, float * restrict s, const void * restrict vx, const void * restrict vy, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y_q = vy; + + HVX_Vector v_sum_float = Q6_V_vzero(); + + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + HVX_Vector v_repl_ctrl = * (const HVX_Vector *) repl; + + const uint32_t quants_size = hex_round_up(n, 128); + const __fp16 * restrict y_scales = (const __fp16 *) (y_q + quants_size); + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + + uint32_t block_idx = kt / 4; + uint32_t sub_idx = kt % 4; + + HVX_Vector vx_i8 = * (const HVX_Vector *) (y_q + block_idx * 128); + HVX_Vector v_act_raw = Q6_V_vror_VR(vx_i8, sub_idx * 32); + + HVX_Vector v_act_rep[8]; + v_act_rep[0] = Q6_V_vdelta_VV(v_act_raw, v_repl_ctrl); + v_act_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 4), v_repl_ctrl); + v_act_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 8), v_repl_ctrl); + v_act_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 12), v_repl_ctrl); + v_act_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 16), v_repl_ctrl); + v_act_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 20), v_repl_ctrl); + v_act_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 24), v_repl_ctrl); + v_act_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 28), v_repl_ctrl); + + HVX_Vector v_sum = accum_4bit_32x1(vptr, v_act_rep, Q6_V_vzero()); + HVX_Vector v_sum_sf = Q6_Vsf_equals_Vw(v_sum); + + HVX_Vector v_scale_offset = vptr[4]; + HVX_VectorPair p_deal = Q6_W_vdeal_VVR(v_scale_offset, v_scale_offset, -2); + HVX_Vector v_scale = Q6_V_lo_W(p_deal); + HVX_Vector v_offset = Q6_V_hi_W(p_deal); + + __fp16 scale_a_val = y_scales[kt * 2 + 0]; + __fp16 sum_a_val = y_scales[kt * 2 + 1]; + HVX_Vector v_scale_a = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a_val)); + HVX_Vector v_sum_a = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&sum_a_val)); + + HVX_Vector v_scale_comb = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale, v_scale_a); + HVX_Vector v_offset_comb = hvx_vec_mul_f16_f16_to_f32_lower32(v_offset, v_sum_a); + + HVX_Vector v_scaled_dot = hvx_vec_mul_f32_f32(v_sum_sf, v_scale_comb); + HVX_Vector v_sum_scaled = hvx_vec_add_f32_f32(v_scaled_dot, v_offset_comb); + + v_sum_float = hvx_vec_add_f32_f32(v_sum_float, v_sum_scaled); + } + + hvx_vec_store_u(s, valid_rows * sizeof(float), v_sum_float); +} + +static void flat_vec_dot_q4_1_32x2(const uint32_t n, float * restrict s0, float * restrict s1, const void * restrict vx, const void * restrict vy0, const void * restrict vy1, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y0_q = vy0; + const uint8_t * restrict y1_q = vy1; + + HVX_Vector v_sum_float_c0 = Q6_V_vzero(); + HVX_Vector v_sum_float_c1 = Q6_V_vzero(); + + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + HVX_Vector v_repl_ctrl = * (const HVX_Vector *) repl; + + const uint32_t quants_size = hex_round_up(n, 128); + const __fp16 * restrict y0_scales = (const __fp16 *) (y0_q + quants_size); + const __fp16 * restrict y1_scales = (const __fp16 *) (y1_q + quants_size); + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + + uint32_t block_idx = kt / 4; + uint32_t sub_idx = kt % 4; + + HVX_Vector vx0_i8 = * (const HVX_Vector *) (y0_q + block_idx * 128); + HVX_Vector vx1_i8 = * (const HVX_Vector *) (y1_q + block_idx * 128); + + HVX_Vector v_act0_raw = Q6_V_vror_VR(vx0_i8, sub_idx * 32); + HVX_Vector v_act1_raw = Q6_V_vror_VR(vx1_i8, sub_idx * 32); + + HVX_Vector v_act0_rep[8]; + v_act0_rep[0] = Q6_V_vdelta_VV(v_act0_raw, v_repl_ctrl); + v_act0_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 4), v_repl_ctrl); + v_act0_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 8), v_repl_ctrl); + v_act0_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 12), v_repl_ctrl); + v_act0_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 16), v_repl_ctrl); + v_act0_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 20), v_repl_ctrl); + v_act0_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 24), v_repl_ctrl); + v_act0_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 28), v_repl_ctrl); + + HVX_Vector v_act1_rep[8]; + v_act1_rep[0] = Q6_V_vdelta_VV(v_act1_raw, v_repl_ctrl); + v_act1_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 4), v_repl_ctrl); + v_act1_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 8), v_repl_ctrl); + v_act1_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 12), v_repl_ctrl); + v_act1_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 16), v_repl_ctrl); + v_act1_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 20), v_repl_ctrl); + v_act1_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 24), v_repl_ctrl); + v_act1_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 28), v_repl_ctrl); + + HVX_VectorPair v_sums = accum_4bit_32x2(vptr, v_act0_rep, v_act1_rep, Q6_V_vzero()); + HVX_Vector v_sum_c0 = Q6_V_lo_W(v_sums); + HVX_Vector v_sum_c1 = Q6_V_hi_W(v_sums); + + HVX_Vector v_sum_sf_c0 = Q6_Vsf_equals_Vw(v_sum_c0); + HVX_Vector v_sum_sf_c1 = Q6_Vsf_equals_Vw(v_sum_c1); + + HVX_Vector v_scale_offset = vptr[4]; + HVX_VectorPair p_deal = Q6_W_vdeal_VVR(v_scale_offset, v_scale_offset, -2); + HVX_Vector v_scale = Q6_V_lo_W(p_deal); + HVX_Vector v_offset = Q6_V_hi_W(p_deal); + + __fp16 scale_a0_val = y0_scales[kt * 2 + 0]; + __fp16 sum_a0_val = y0_scales[kt * 2 + 1]; + __fp16 scale_a1_val = y1_scales[kt * 2 + 0]; + __fp16 sum_a1_val = y1_scales[kt * 2 + 1]; + + HVX_Vector v_scale_a0 = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a0_val)); + HVX_Vector v_sum_a0 = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&sum_a0_val)); + HVX_Vector v_scale_a1 = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a1_val)); + HVX_Vector v_sum_a1 = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&sum_a1_val)); + + HVX_Vector v_scale_comb_c0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale, v_scale_a0); + HVX_Vector v_offset_comb_c0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_offset, v_sum_a0); + HVX_Vector v_scale_comb_c1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale, v_scale_a1); + HVX_Vector v_offset_comb_c1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_offset, v_sum_a1); + + HVX_Vector v_scaled_dot_c0 = hvx_vec_mul_f32_f32(v_sum_sf_c0, v_scale_comb_c0); + HVX_Vector v_sum_scaled_c0 = hvx_vec_add_f32_f32(v_scaled_dot_c0, v_offset_comb_c0); + + HVX_Vector v_scaled_dot_c1 = hvx_vec_mul_f32_f32(v_sum_sf_c1, v_scale_comb_c1); + HVX_Vector v_sum_scaled_c1 = hvx_vec_add_f32_f32(v_scaled_dot_c1, v_offset_comb_c1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, v_sum_scaled_c0); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, v_sum_scaled_c1); + } + + hvx_vec_store_u(s0, valid_rows * sizeof(float), v_sum_float_c0); + hvx_vec_store_u(s1, valid_rows * sizeof(float), v_sum_float_c1); +} + +static void flat_vec_dot_q8_0_32x1(const uint32_t n, float * restrict s, const void * restrict vx, const void * restrict vy, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y_q = vy; + + HVX_Vector v_sum_float = Q6_V_vzero(); + + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + HVX_Vector v_repl_ctrl = * (const HVX_Vector *) repl; + + const uint32_t quants_size = hex_round_up(n, 128); + const __fp16 * restrict y_scales = (const __fp16 *) (y_q + quants_size); + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 1152); + + uint32_t block_idx = kt / 4; + uint32_t sub_idx = kt % 4; + + HVX_Vector vx_i8 = * (const HVX_Vector *) (y_q + block_idx * 128); + HVX_Vector v_act_raw = Q6_V_vror_VR(vx_i8, sub_idx * 32); + + HVX_Vector v_act_rep[8]; + v_act_rep[0] = Q6_V_vdelta_VV(v_act_raw, v_repl_ctrl); + v_act_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 4), v_repl_ctrl); + v_act_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 8), v_repl_ctrl); + v_act_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 12), v_repl_ctrl); + v_act_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 16), v_repl_ctrl); + v_act_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 20), v_repl_ctrl); + v_act_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 24), v_repl_ctrl); + v_act_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 28), v_repl_ctrl); + + HVX_Vector v_sum = accum_q8_0_32x1(vptr, v_act_rep); + HVX_Vector v_sum_sf = Q6_Vsf_equals_Vw(v_sum); + + HVX_Vector v_scale_w = vptr[8]; + + __fp16 scale_a_val = y_scales[kt]; + HVX_Vector v_scale_a = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a_val)); + + HVX_Vector v_scale_comb = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a); + HVX_Vector v_sum_scaled = hvx_vec_mul_f32_f32(v_sum_sf, v_scale_comb); + + v_sum_float = hvx_vec_add_f32_f32(v_sum_float, v_sum_scaled); + } + + hvx_vec_store_u(s, valid_rows * sizeof(float), v_sum_float); +} + +static void flat_vec_dot_q8_0_32x2(const uint32_t n, float * restrict s0, float * restrict s1, const void * restrict vx, const void * restrict vy0, const void * restrict vy1, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y0_q = vy0; + const uint8_t * restrict y1_q = vy1; + + HVX_Vector v_sum_float_c0 = Q6_V_vzero(); + HVX_Vector v_sum_float_c1 = Q6_V_vzero(); + + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + HVX_Vector v_repl_ctrl = * (const HVX_Vector *) repl; + + const uint32_t quants_size = hex_round_up(n, 128); + const __fp16 * restrict y0_scales = (const __fp16 *) (y0_q + quants_size); + const __fp16 * restrict y1_scales = (const __fp16 *) (y1_q + quants_size); + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 1152); + + uint32_t block_idx = kt / 4; + uint32_t sub_idx = kt % 4; + + HVX_Vector vx0_i8 = * (const HVX_Vector *) (y0_q + block_idx * 128); + HVX_Vector vx1_i8 = * (const HVX_Vector *) (y1_q + block_idx * 128); + + HVX_Vector v_act0_raw = Q6_V_vror_VR(vx0_i8, sub_idx * 32); + HVX_Vector v_act1_raw = Q6_V_vror_VR(vx1_i8, sub_idx * 32); + + HVX_Vector v_act0_rep[8]; + v_act0_rep[0] = Q6_V_vdelta_VV(v_act0_raw, v_repl_ctrl); + v_act0_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 4), v_repl_ctrl); + v_act0_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 8), v_repl_ctrl); + v_act0_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 12), v_repl_ctrl); + v_act0_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 16), v_repl_ctrl); + v_act0_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 20), v_repl_ctrl); + v_act0_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 24), v_repl_ctrl); + v_act0_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 28), v_repl_ctrl); + + HVX_Vector v_act1_rep[8]; + v_act1_rep[0] = Q6_V_vdelta_VV(v_act1_raw, v_repl_ctrl); + v_act1_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 4), v_repl_ctrl); + v_act1_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 8), v_repl_ctrl); + v_act1_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 12), v_repl_ctrl); + v_act1_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 16), v_repl_ctrl); + v_act1_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 20), v_repl_ctrl); + v_act1_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 24), v_repl_ctrl); + v_act1_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 28), v_repl_ctrl); + + HVX_VectorPair v_sums = accum_q8_0_32x2(vptr, v_act0_rep, v_act1_rep); + HVX_Vector v_sum_c0 = Q6_V_lo_W(v_sums); + HVX_Vector v_sum_c1 = Q6_V_hi_W(v_sums); + + HVX_Vector v_sum_sf_c0 = Q6_Vsf_equals_Vw(v_sum_c0); + HVX_Vector v_sum_sf_c1 = Q6_Vsf_equals_Vw(v_sum_c1); + + HVX_Vector v_scale_w = vptr[8]; + + __fp16 scale_a0_val = y0_scales[kt]; + __fp16 scale_a1_val = y1_scales[kt]; + HVX_Vector v_scale_a0 = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a0_val)); + HVX_Vector v_scale_a1 = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a1_val)); + + HVX_Vector v_scale_comb_c0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a0); + HVX_Vector v_scale_comb_c1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a1); + + HVX_Vector v_sum_scaled_c0 = hvx_vec_mul_f32_f32(v_sum_sf_c0, v_scale_comb_c0); + HVX_Vector v_sum_scaled_c1 = hvx_vec_mul_f32_f32(v_sum_sf_c1, v_scale_comb_c1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, v_sum_scaled_c0); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, v_sum_scaled_c1); + } + + hvx_vec_store_u(s0, valid_rows * sizeof(float), v_sum_float_c0); + hvx_vec_store_u(s1, valid_rows * sizeof(float), v_sum_float_c1); +} + +static void flat_vec_dot_iq4nl_32x1(const uint32_t n, float * restrict s, const void * restrict vx, const void * restrict vy, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y_q = vy; + + HVX_Vector v_sum_float = Q6_V_vzero(); + HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + HVX_Vector lut = *(const HVX_Vector *) kvalues_iq4nl_lut; + + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + HVX_Vector v_repl_ctrl = * (const HVX_Vector *) repl; + + const uint32_t quants_size = hex_round_up(n, 128); + const __fp16 * restrict y_scales = (const __fp16 *) (y_q + quants_size); + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + + uint32_t block_idx = kt / 4; + uint32_t sub_idx = kt % 4; + + HVX_Vector vx = * (const HVX_Vector *) (y_q + block_idx * 128); + HVX_Vector v_act_raw = Q6_V_vror_VR(vx, sub_idx * 32); + + HVX_Vector v_act_rep[8]; + v_act_rep[0] = Q6_V_vdelta_VV(v_act_raw, v_repl_ctrl); + v_act_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 4), v_repl_ctrl); + v_act_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 8), v_repl_ctrl); + v_act_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 12), v_repl_ctrl); + v_act_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 16), v_repl_ctrl); + v_act_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 20), v_repl_ctrl); + v_act_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 24), v_repl_ctrl); + v_act_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 28), v_repl_ctrl); + + HVX_Vector v_sum = accum_4bit_32x1_lut(vptr, v_act_rep, mask_h4, lut); + HVX_Vector v_sum_sf = Q6_Vsf_equals_Vw(v_sum); + + HVX_Vector v_scale_w = vptr[4]; + + __fp16 scale_a_val = y_scales[kt]; + HVX_Vector v_scale_a = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a_val)); + + HVX_Vector v_scale_comb = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a); + HVX_Vector v_sum_scaled = hvx_vec_mul_f32_f32(v_sum_sf, v_scale_comb); + + v_sum_float = hvx_vec_add_f32_f32(v_sum_float, v_sum_scaled); + } + + hvx_vec_store_u(s, valid_rows * sizeof(float), v_sum_float); +} + +static void flat_vec_dot_iq4nl_32x2(const uint32_t n, float * restrict s0, float * restrict s1, const void * restrict vx, const void * restrict vy0, const void * restrict vy1, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y0_q = vy0; + const uint8_t * restrict y1_q = vy1; + + HVX_Vector v_sum_float_c0 = Q6_V_vzero(); + HVX_Vector v_sum_float_c1 = Q6_V_vzero(); + HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + HVX_Vector lut = *(const HVX_Vector *) kvalues_iq4nl_lut; + + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + HVX_Vector v_repl_ctrl = * (const HVX_Vector *) repl; + + const uint32_t quants_size = hex_round_up(n, 128); + const __fp16 * restrict y0_scales = (const __fp16 *) (y0_q + quants_size); + const __fp16 * restrict y1_scales = (const __fp16 *) (y1_q + quants_size); + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + + uint32_t block_idx = kt / 4; + uint32_t sub_idx = kt % 4; + + HVX_Vector vx0 = * (const HVX_Vector *) (y0_q + block_idx * 128); + HVX_Vector vx1 = * (const HVX_Vector *) (y1_q + block_idx * 128); + + HVX_Vector v_act0_raw = Q6_V_vror_VR(vx0, sub_idx * 32); + HVX_Vector v_act1_raw = Q6_V_vror_VR(vx1, sub_idx * 32); + + HVX_Vector v_act0_rep[8]; + v_act0_rep[0] = Q6_V_vdelta_VV(v_act0_raw, v_repl_ctrl); + v_act0_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 4), v_repl_ctrl); + v_act0_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 8), v_repl_ctrl); + v_act0_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 12), v_repl_ctrl); + v_act0_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 16), v_repl_ctrl); + v_act0_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 20), v_repl_ctrl); + v_act0_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 24), v_repl_ctrl); + v_act0_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 28), v_repl_ctrl); + + HVX_Vector v_act1_rep[8]; + v_act1_rep[0] = Q6_V_vdelta_VV(v_act1_raw, v_repl_ctrl); + v_act1_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 4), v_repl_ctrl); + v_act1_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 8), v_repl_ctrl); + v_act1_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 12), v_repl_ctrl); + v_act1_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 16), v_repl_ctrl); + v_act1_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 20), v_repl_ctrl); + v_act1_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 24), v_repl_ctrl); + v_act1_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 28), v_repl_ctrl); + + HVX_VectorPair v_sums = accum_4bit_32x2_lut(vptr, v_act0_rep, v_act1_rep, mask_h4, lut); + HVX_Vector v_sum_c0 = Q6_V_lo_W(v_sums); + HVX_Vector v_sum_c1 = Q6_V_hi_W(v_sums); + + HVX_Vector v_sum_sf_c0 = Q6_Vsf_equals_Vw(v_sum_c0); + HVX_Vector v_sum_sf_c1 = Q6_Vsf_equals_Vw(v_sum_c1); + + HVX_Vector v_scale_w = vptr[4]; + + __fp16 scale_a0_val = y0_scales[kt]; + __fp16 scale_a1_val = y1_scales[kt]; + HVX_Vector v_scale_a0 = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a0_val)); + HVX_Vector v_scale_a1 = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a1_val)); + + HVX_Vector v_scale_comb_c0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a0); + HVX_Vector v_scale_comb_c1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a1); + + HVX_Vector v_sum_scaled_c0 = hvx_vec_mul_f32_f32(v_sum_sf_c0, v_scale_comb_c0); + HVX_Vector v_sum_scaled_c1 = hvx_vec_mul_f32_f32(v_sum_sf_c1, v_scale_comb_c1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, v_sum_scaled_c0); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, v_sum_scaled_c1); + } + + hvx_vec_store_u(s0, valid_rows * sizeof(float), v_sum_float_c0); + hvx_vec_store_u(s1, valid_rows * sizeof(float), v_sum_float_c1); +} + +static void flat_vec_dot_mxfp4_32x1(const uint32_t n, float * restrict s, const void * restrict vx, const void * restrict vy, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y_q = vy; + + HVX_Vector v_sum_float = Q6_V_vzero(); + HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut; + HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; + HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); + + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + HVX_Vector v_repl_ctrl = * (const HVX_Vector *) repl; + + const uint32_t quants_size = hex_round_up(n, 128); + const __fp16 * restrict y_scales = (const __fp16 *) (y_q + quants_size); + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + + uint32_t block_idx = kt / 4; + uint32_t sub_idx = kt % 4; + + HVX_Vector vx = * (const HVX_Vector *) (y_q + block_idx * 128); + HVX_Vector v_act_raw = Q6_V_vror_VR(vx, sub_idx * 32); + + HVX_Vector v_act_rep[8]; + v_act_rep[0] = Q6_V_vdelta_VV(v_act_raw, v_repl_ctrl); + v_act_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 4), v_repl_ctrl); + v_act_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 8), v_repl_ctrl); + v_act_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 12), v_repl_ctrl); + v_act_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 16), v_repl_ctrl); + v_act_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 20), v_repl_ctrl); + v_act_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 24), v_repl_ctrl); + v_act_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act_raw, 28), v_repl_ctrl); + + HVX_Vector v_sum = accum_4bit_32x1_lut(vptr, v_act_rep, mask_h4, lut); + HVX_Vector v_sum_sf = Q6_Vsf_equals_Vw(v_sum); + + HVX_Vector v_scale_w = hvx_vmem(tile_ptr + kt * 640 + 512); + HVX_Vector r0_d = Q6_V_vdelta_VV(v_scale_w, expand); + r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); + HVX_Vector v_scale_w_f32 = Q6_Vw_vasl_VwR(r0_d, 23); + + __fp16 scale_a_val = y_scales[kt]; + HVX_Vector v_scale_a_f16 = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a_val)); + HVX_VectorPair p_scale_a_f32 = hvx_vec_f16_to_f32(v_scale_a_f16); + HVX_Vector v_scale_a = Q6_V_lo_W(p_scale_a_f32); + + HVX_Vector v_scale_comb = hvx_vec_mul_f32_f32(v_scale_w_f32, v_scale_a); + HVX_Vector v_sum_scaled = hvx_vec_mul_f32_f32(v_sum_sf, v_scale_comb); + + v_sum_float = hvx_vec_add_f32_f32(v_sum_float, v_sum_scaled); + } + + v_sum_float = hvx_vec_mul_f32_f32(v_sum_float, hvx_vec_splat_f32(0.5f)); + + hvx_vec_store_u(s, valid_rows * sizeof(float), v_sum_float); +} + +static void flat_vec_dot_mxfp4_32x2(const uint32_t n, float * restrict s0, float * restrict s1, const void * restrict vx, const void * restrict vy0, const void * restrict vy1, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y0_q = vy0; + const uint8_t * restrict y1_q = vy1; + + HVX_Vector v_sum_float_c0 = Q6_V_vzero(); + HVX_Vector v_sum_float_c1 = Q6_V_vzero(); + HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut; + HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; + HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); + + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + HVX_Vector v_repl_ctrl = * (const HVX_Vector *) repl; + + const uint32_t quants_size = hex_round_up(n, 128); + const __fp16 * restrict y0_scales = (const __fp16 *) (y0_q + quants_size); + const __fp16 * restrict y1_scales = (const __fp16 *) (y1_q + quants_size); + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + + uint32_t block_idx = kt / 4; + uint32_t sub_idx = kt % 4; + + HVX_Vector vx0 = * (const HVX_Vector *) (y0_q + block_idx * 128); + HVX_Vector vx1 = * (const HVX_Vector *) (y1_q + block_idx * 128); + + HVX_Vector v_act0_raw = Q6_V_vror_VR(vx0, sub_idx * 32); + HVX_Vector v_act1_raw = Q6_V_vror_VR(vx1, sub_idx * 32); + + HVX_Vector v_act0_rep[8]; + v_act0_rep[0] = Q6_V_vdelta_VV(v_act0_raw, v_repl_ctrl); + v_act0_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 4), v_repl_ctrl); + v_act0_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 8), v_repl_ctrl); + v_act0_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 12), v_repl_ctrl); + v_act0_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 16), v_repl_ctrl); + v_act0_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 20), v_repl_ctrl); + v_act0_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 24), v_repl_ctrl); + v_act0_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act0_raw, 28), v_repl_ctrl); + + HVX_Vector v_act1_rep[8]; + v_act1_rep[0] = Q6_V_vdelta_VV(v_act1_raw, v_repl_ctrl); + v_act1_rep[1] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 4), v_repl_ctrl); + v_act1_rep[2] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 8), v_repl_ctrl); + v_act1_rep[3] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 12), v_repl_ctrl); + v_act1_rep[4] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 16), v_repl_ctrl); + v_act1_rep[5] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 20), v_repl_ctrl); + v_act1_rep[6] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 24), v_repl_ctrl); + v_act1_rep[7] = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act1_raw, 28), v_repl_ctrl); + + HVX_VectorPair v_sums = accum_4bit_32x2_lut(vptr, v_act0_rep, v_act1_rep, mask_h4, lut); + HVX_Vector v_sum_c0 = Q6_V_lo_W(v_sums); + HVX_Vector v_sum_c1 = Q6_V_hi_W(v_sums); + + HVX_Vector v_sum_sf_c0 = Q6_Vsf_equals_Vw(v_sum_c0); + HVX_Vector v_sum_sf_c1 = Q6_Vsf_equals_Vw(v_sum_c1); + + HVX_Vector v_scale_w = hvx_vmem(tile_ptr + kt * 640 + 512); + HVX_Vector r0_d = Q6_V_vdelta_VV(v_scale_w, expand); + r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); + HVX_Vector v_scale_w_f32 = Q6_Vw_vasl_VwR(r0_d, 23); + + __fp16 scale_a0_val = y0_scales[kt]; + __fp16 scale_a1_val = y1_scales[kt]; + HVX_Vector v_scale_a0_f16 = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a0_val)); + HVX_Vector v_scale_a1_f16 = hvx_vec_repl_f16(Q6_Vh_vsplat_R(*(const int16_t *)&scale_a1_val)); + HVX_VectorPair p_scale_a0_f32 = hvx_vec_f16_to_f32(v_scale_a0_f16); + HVX_VectorPair p_scale_a1_f32 = hvx_vec_f16_to_f32(v_scale_a1_f16); + HVX_Vector v_scale_a0 = Q6_V_lo_W(p_scale_a0_f32); + HVX_Vector v_scale_a1 = Q6_V_lo_W(p_scale_a1_f32); + + HVX_Vector v_scale_comb_c0 = hvx_vec_mul_f32_f32(v_scale_w_f32, v_scale_a0); + HVX_Vector v_scale_comb_c1 = hvx_vec_mul_f32_f32(v_scale_w_f32, v_scale_a1); + + HVX_Vector v_sum_scaled_c0 = hvx_vec_mul_f32_f32(v_sum_sf_c0, v_scale_comb_c0); + HVX_Vector v_sum_scaled_c1 = hvx_vec_mul_f32_f32(v_sum_sf_c1, v_scale_comb_c1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, v_sum_scaled_c0); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, v_sum_scaled_c1); + } + + v_sum_float_c0 = hvx_vec_mul_f32_f32(v_sum_float_c0, hvx_vec_splat_f32(0.5f)); + v_sum_float_c1 = hvx_vec_mul_f32_f32(v_sum_float_c1, hvx_vec_splat_f32(0.5f)); + + hvx_vec_store_u(s0, valid_rows * sizeof(float), v_sum_float_c0); + hvx_vec_store_u(s1, valid_rows * sizeof(float), v_sum_float_c1); +} diff --git a/ggml/src/ggml-hexagon/htp/hvx-mm-kernels-tiled.h b/ggml/src/ggml-hexagon/htp/hvx-mm-kernels-tiled.h new file mode 100644 index 000000000000..bcb0b8f9e474 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hvx-mm-kernels-tiled.h @@ -0,0 +1,1140 @@ +// Dynamic quantizers that produce tiled activations + +static inline void quantize_block_f32_q8_1_tiled(float * restrict x, uint8_t * restrict y_block) { + assert((unsigned long) x % 128 == 0); + assert((unsigned long) y_block % 128 == 0); + + HVX_Vector * vx = (HVX_Vector *) x; + HVX_Vector zero = Q6_V_vzero(); + + HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0])); + HVX_Vector vmax1_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[1])); + HVX_Vector vmax2_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[2])); + HVX_Vector vmax3_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[3])); + + HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); + HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); + HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); + HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); + + HVX_Vector vmax0_qf = Q6_Vqf32_vsub_VsfVsf(vmax0_sf, zero); + HVX_Vector vmax1_qf = Q6_Vqf32_vsub_VsfVsf(vmax1_sf, zero); + HVX_Vector vmax2_qf = Q6_Vqf32_vsub_VsfVsf(vmax2_sf, zero); + HVX_Vector vmax3_qf = Q6_Vqf32_vsub_VsfVsf(vmax3_sf, zero); + + HVX_Vector vmax01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax1_qf, vmax0_qf))); + HVX_Vector vmax23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax3_qf, vmax2_qf))); + + HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf))); + HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); + + HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 + HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 + HVX_Vector vd01_hf = Q6_Vhf_equals_Vqf16(vd01_qf16); + HVX_Vector vd23_hf = Q6_Vhf_equals_Vqf16(vd23_qf16); + + HVX_Vector vd01_inv_hf = hvx_vec_inverse_f16(vd01_hf); + HVX_Vector vd23_inv_hf = hvx_vec_inverse_f16(vd23_hf); + vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf)); + vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf)); + + HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf); + HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf); + HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16); + + const HVX_Vector ones = Q6_Vb_vsplat_R(1); + HVX_Vector v_sums = Q6_Vw_vrmpy_VbVb(vx_i8, ones); + v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 4)); + v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 8)); + v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 16)); + + float vmax0[32] __attribute__((aligned(128))); + float vmax1[32] __attribute__((aligned(128))); + float vmax2[32] __attribute__((aligned(128))); + float vmax3[32] __attribute__((aligned(128))); + int32_t sums[32] __attribute__((aligned(128))); + + hvx_vec_store_u(vmax0, 128, vmax0_sf); + hvx_vec_store_u(vmax1, 128, vmax1_sf); + hvx_vec_store_u(vmax2, 128, vmax2_sf); + hvx_vec_store_u(vmax3, 128, vmax3_sf); + hvx_vec_store_u(sums, 128, v_sums); + + float d0 = vmax0[0] / 127.0f; + float d1 = vmax1[0] / 127.0f; + float d2 = vmax2[0] / 127.0f; + float d3 = vmax3[0] / 127.0f; + + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + HVX_Vector v_repl_ctrl = * (const HVX_Vector *) repl; + + for (int b = 0; b < 4; b++) { + HVX_Vector v_act = Q6_V_vror_VR(vx_i8, b * 32); + + HVX_Vector r0 = Q6_V_vdelta_VV(v_act, v_repl_ctrl); + HVX_Vector r1 = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act, 4), v_repl_ctrl); + HVX_Vector r2 = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act, 8), v_repl_ctrl); + HVX_Vector r3 = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act, 12), v_repl_ctrl); + HVX_Vector r4 = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act, 16), v_repl_ctrl); + HVX_Vector r5 = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act, 20), v_repl_ctrl); + HVX_Vector r6 = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act, 24), v_repl_ctrl); + HVX_Vector r7 = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act, 28), v_repl_ctrl); + + __fp16 scale_h, offset_h; + if (b == 0) { + scale_h = (__fp16) d0; + offset_h = (__fp16) (sums[0] * d0); + } else if (b == 1) { + scale_h = (__fp16) d1; + offset_h = (__fp16) (sums[8] * d1); + } else if (b == 2) { + scale_h = (__fp16) d2; + offset_h = (__fp16) (sums[16] * d2); + } else { + scale_h = (__fp16) d3; + offset_h = (__fp16) (sums[24] * d3); + } + + HVX_Vector r_scale = Q6_Vh_vsplat_R(*(int16_t *)&scale_h); + HVX_Vector r_offset = Q6_Vh_vsplat_R(*(int16_t *)&offset_h); + + HVX_Vector * restrict dst = (HVX_Vector *) (y_block + b * 1280); + dst[0] = r0; + dst[1] = r1; + dst[2] = r2; + dst[3] = r3; + dst[4] = r4; + dst[5] = r5; + dst[6] = r6; + dst[7] = r7; + dst[8] = r_scale; + dst[9] = r_offset; + } +} + +static inline void quantize_block_f32_q8_0_tiled(float * restrict x, uint8_t * restrict y_block) { + assert((unsigned long) x % 128 == 0); + assert((unsigned long) y_block % 128 == 0); + + HVX_Vector * vx = (HVX_Vector *) x; + HVX_Vector zero = Q6_V_vzero(); + + HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); + HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); + HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); + HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); + + HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf))); + HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); + + HVX_Vector vmax_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf)); + vmax_hf = hvx_vec_reduce_max2_f16(hvx_vec_abs_f16(vx23_hf), vmax_hf); + + HVX_Vector vd_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax_hf, Q6_Vh_vsplat_R(0x2008)); + HVX_Vector vd_hf = Q6_Vhf_equals_Vqf16(vd_qf16); + + HVX_Vector vd_inv_hf = hvx_vec_inverse_f16(vd_hf); + vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd_inv_hf)); + vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd_inv_hf)); + + HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf); + HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf); + HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16); + + HVX_Vector r_scale = hvx_vec_repl_f16(vd_hf); + + static const uint8_t __attribute__((aligned(128))) repl[128] = { + 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x40, 0x40, 0x40, 0x40, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x20, 0x20, 0x20, 0x20, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + 0x10, 0x10, 0x10, 0x10, 0x04, 0x04, 0x04, 0x04, 0x08, 0x08, 0x08, 0x08, 0x04, 0x04, 0x04, 0x04, + }; + HVX_Vector v_repl_ctrl = * (const HVX_Vector *) repl; + + for (int b = 0; b < 4; b++) { + HVX_Vector v_act = Q6_V_vror_VR(vx_i8, b * 32); + + HVX_Vector r0 = Q6_V_vdelta_VV(v_act, v_repl_ctrl); + HVX_Vector r1 = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act, 4), v_repl_ctrl); + HVX_Vector r2 = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act, 8), v_repl_ctrl); + HVX_Vector r3 = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act, 12), v_repl_ctrl); + HVX_Vector r4 = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act, 16), v_repl_ctrl); + HVX_Vector r5 = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act, 20), v_repl_ctrl); + HVX_Vector r6 = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act, 24), v_repl_ctrl); + HVX_Vector r7 = Q6_V_vdelta_VV(Q6_V_vror_VR(v_act, 28), v_repl_ctrl); + + HVX_Vector * restrict dst = (HVX_Vector *) (y_block + b * 1152); + dst[0] = r0; + dst[1] = r1; + dst[2] = r2; + dst[3] = r3; + dst[4] = r4; + dst[5] = r5; + dst[6] = r6; + dst[7] = r7; + dst[8] = r_scale; + } +} + +static void quantize_row_f32_q8_0_tiled(float * restrict x, uint8_t * restrict y, uint32_t k) { + assert(k % 32 == 0); + const uint32_t qk = QK_Q8_0_TILED; + const uint32_t nb = (k + qk - 1) / qk; + + for (uint32_t i = 0; i < nb; i++) { + uint8_t * restrict y_block = y + i * 4 * 1152; + quantize_block_f32_q8_0_tiled(x + i * qk, y_block); + } +} + +static void quantize_row_f32_q8_1_tiled(float * restrict x, uint8_t * restrict y, uint32_t k) { + assert(k % 32 == 0); + const uint32_t qk = QK_Q8_0_TILED; + const uint32_t nb = (k + qk - 1) / qk; + + for (uint32_t i = 0; i < nb; i++) { + uint8_t * restrict y_block = y + i * 4 * 1280; + quantize_block_f32_q8_1_tiled(x + i * qk, y_block); + } +} + +// Dot kernels & helpers that consume tiled activations + +static inline HVX_Vector hvx_vec_mul_f16_f16_to_f32_lower32(HVX_Vector v1, HVX_Vector v2) { +#if __HVX_ARCH__ >= 79 + HVX_VectorPair p = Q6_Wsf_vmpy_VhfVhf(v1, v2); + return Q6_V_lo_W(Q6_W_vshuff_VVR(Q6_V_hi_W(p), Q6_V_lo_W(p), -4)); +#else + HVX_VectorPair p = Q6_Wqf32_vmpy_VhfVhf(v1, v2); + HVX_Vector hi = Q6_Vsf_equals_Vqf32(Q6_V_hi_W(p)); + HVX_Vector lo = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(p)); + return Q6_V_lo_W(Q6_W_vshuff_VVR(hi, lo, -4)); +#endif +} + +static inline HVX_Vector unpack_and_interleave_4bit(HVX_Vector v_a, HVX_Vector v_b, HVX_Vector mask_h4) { + HVX_Vector v_W0 = Q6_V_vand_VV(v_a, mask_h4); + HVX_Vector v_W1 = Q6_Vub_vlsr_VubR(v_a, 4); + HVX_Vector v_W2 = Q6_V_vand_VV(v_b, mask_h4); + HVX_Vector v_W3 = Q6_Vub_vlsr_VubR(v_b, 4); + + HVX_VectorPair v01_pair = Q6_W_vshuff_VVR(v_W1, v_W0, -1); + HVX_VectorPair v23_pair = Q6_W_vshuff_VVR(v_W3, v_W2, -1); + HVX_VectorPair v0123_pair = Q6_W_vshuff_VVR(Q6_V_lo_W(v23_pair), Q6_V_lo_W(v01_pair), -2); + return Q6_V_lo_W(v0123_pair); +} + +static inline HVX_VectorPair unpack_and_interleave_4bit_x2(HVX_Vector v_src, HVX_Vector mask_h4) { + HVX_Vector v_lo = Q6_V_vand_VV(v_src, mask_h4); + HVX_Vector v_hi = Q6_Vub_vlsr_VubR(v_src, 4); + HVX_VectorPair v01_pair = Q6_W_vshuff_VVR(v_hi, v_lo, -1); + HVX_Vector v01_lo = Q6_V_lo_W(v01_pair); + HVX_Vector v01_hi = Q6_V_hi_W(v01_pair); + + HVX_Vector v23_lo = Q6_V_valign_VVR(v01_hi, v01_lo, 64); + HVX_Vector v_W0 = Q6_V_lo_W(Q6_W_vshuff_VVR(v23_lo, v01_lo, -2)); + + HVX_Vector v67_lo = Q6_V_valign_VVR(v01_lo, v01_hi, 64); + HVX_Vector v_W1 = Q6_V_lo_W(Q6_W_vshuff_VVR(v67_lo, v01_hi, -2)); + + return Q6_W_vcombine_VV(v_W1, v_W0); +} + +static inline HVX_Vector accum_4bit_32x1( + const HVX_Vector * restrict vptr, + const HVX_Vector * restrict v_act, + HVX_Vector i8 +) { + HVX_Vector v_sum0 = Q6_V_vzero(); + HVX_Vector v_sum1 = Q6_V_vzero(); + HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + + #pragma unroll + for (int i = 0; i < 4; i++) { + HVX_VectorPair v_W_pair = unpack_and_interleave_4bit_x2(vptr[i], mask_h4); + HVX_Vector v_W0 = Q6_Vb_vsub_VbVb(Q6_V_lo_W(v_W_pair), i8); + HVX_Vector v_W1 = Q6_Vb_vsub_VbVb(Q6_V_hi_W(v_W_pair), i8); + v_sum0 = Q6_Vw_vrmpyacc_VwVbVb(v_sum0, v_W0, v_act[i * 2 + 0]); + v_sum1 = Q6_Vw_vrmpyacc_VwVbVb(v_sum1, v_W1, v_act[i * 2 + 1]); + } + + return Q6_Vw_vadd_VwVw(v_sum0, v_sum1); +} + +static inline HVX_Vector accum_4bit_32x1_lut( + const HVX_Vector * restrict vptr, + const HVX_Vector * restrict v_act, + HVX_Vector mask_h4, + HVX_Vector lut +) { + HVX_Vector v_sum0 = Q6_V_vzero(); + HVX_Vector v_sum1 = Q6_V_vzero(); + + #pragma unroll + for (int i = 0; i < 4; i++) { + HVX_VectorPair v_W_pair = unpack_and_interleave_4bit_x2(vptr[i], mask_h4); + HVX_Vector v_W0 = Q6_Vb_vlut32_VbVbI(Q6_V_lo_W(v_W_pair), lut, 0); + HVX_Vector v_W1 = Q6_Vb_vlut32_VbVbI(Q6_V_hi_W(v_W_pair), lut, 0); + v_sum0 = Q6_Vw_vrmpyacc_VwVbVb(v_sum0, v_W0, v_act[i * 2 + 0]); + v_sum1 = Q6_Vw_vrmpyacc_VwVbVb(v_sum1, v_W1, v_act[i * 2 + 1]); + } + + return Q6_Vw_vadd_VwVw(v_sum0, v_sum1); +} + +static inline HVX_VectorPair accum_4bit_32x2( + const HVX_Vector * restrict vptr, + const HVX_Vector * restrict v_act0, + const HVX_Vector * restrict v_act1, + HVX_Vector i8 +) { + HVX_Vector v_sum0 = Q6_V_vzero(); + HVX_Vector v_sum1 = Q6_V_vzero(); + HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + + #pragma unroll + for (int i = 0; i < 4; i++) { + HVX_VectorPair v_W_pair = unpack_and_interleave_4bit_x2(vptr[i], mask_h4); + HVX_Vector v_W0 = Q6_Vb_vsub_VbVb(Q6_V_lo_W(v_W_pair), i8); + HVX_Vector v_W1 = Q6_Vb_vsub_VbVb(Q6_V_hi_W(v_W_pair), i8); + + v_sum0 = Q6_Vw_vrmpyacc_VwVbVb(v_sum0, v_W0, v_act0[i * 2 + 0]); + v_sum0 = Q6_Vw_vrmpyacc_VwVbVb(v_sum0, v_W1, v_act0[i * 2 + 1]); + + v_sum1 = Q6_Vw_vrmpyacc_VwVbVb(v_sum1, v_W0, v_act1[i * 2 + 0]); + v_sum1 = Q6_Vw_vrmpyacc_VwVbVb(v_sum1, v_W1, v_act1[i * 2 + 1]); + } + + return Q6_W_vcombine_VV(v_sum1, v_sum0); +} + +static inline HVX_VectorPair accum_4bit_32x2_lut( + const HVX_Vector * restrict vptr, + const HVX_Vector * restrict v_act0, + const HVX_Vector * restrict v_act1, + HVX_Vector mask_h4, + HVX_Vector lut +) { + HVX_Vector v_sum0 = Q6_V_vzero(); + HVX_Vector v_sum1 = Q6_V_vzero(); + + #pragma unroll + for (int i = 0; i < 4; i++) { + HVX_VectorPair v_W_pair = unpack_and_interleave_4bit_x2(vptr[i], mask_h4); + HVX_Vector v_W0 = Q6_Vb_vlut32_VbVbI(Q6_V_lo_W(v_W_pair), lut, 0); + HVX_Vector v_W1 = Q6_Vb_vlut32_VbVbI(Q6_V_hi_W(v_W_pair), lut, 0); + + v_sum0 = Q6_Vw_vrmpyacc_VwVbVb(v_sum0, v_W0, v_act0[i * 2 + 0]); + v_sum0 = Q6_Vw_vrmpyacc_VwVbVb(v_sum0, v_W1, v_act0[i * 2 + 1]); + + v_sum1 = Q6_Vw_vrmpyacc_VwVbVb(v_sum1, v_W0, v_act1[i * 2 + 0]); + v_sum1 = Q6_Vw_vrmpyacc_VwVbVb(v_sum1, v_W1, v_act1[i * 2 + 1]); + } + + return Q6_W_vcombine_VV(v_sum1, v_sum0); +} + +static inline HVX_Vector accum_q8_0_32x1( + const HVX_Vector * restrict vptr, + const HVX_Vector * restrict v_act +) { + HVX_Vector v_sum = Q6_V_vzero(); + #pragma unroll + for (int g = 0; g < 8; g++) { + HVX_Vector v_rot = Q6_V_vror_VR(vptr[g], 64); + HVX_Vector v_W = Q6_V_lo_W(Q6_W_vshuff_VVR(v_rot, vptr[g], -2)); + v_sum = Q6_Vw_vrmpyacc_VwVbVb(v_sum, v_W, v_act[g]); + } + return v_sum; +} + +static inline HVX_VectorPair accum_q8_0_32x2( + const HVX_Vector * restrict vptr, + const HVX_Vector * restrict v_act0, + const HVX_Vector * restrict v_act1 +) { + HVX_Vector v_sum0 = Q6_V_vzero(); + HVX_Vector v_sum1 = Q6_V_vzero(); + #pragma unroll + for (int g = 0; g < 8; g++) { + HVX_Vector v_rot = Q6_V_vror_VR(vptr[g], 64); + HVX_Vector v_W = Q6_V_lo_W(Q6_W_vshuff_VVR(v_rot, vptr[g], -2)); + v_sum0 = Q6_Vw_vrmpyacc_VwVbVb(v_sum0, v_W, v_act0[g]); + v_sum1 = Q6_Vw_vrmpyacc_VwVbVb(v_sum1, v_W, v_act1[g]); + } + return Q6_W_vcombine_VV(v_sum1, v_sum0); +} + +static void tiled_vec_dot_q4_0_32x1(const uint32_t n, float * restrict s, const void * restrict vx, const void * restrict vy, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y_q = vy; + + HVX_Vector v_sum_float = Q6_V_vzero(); + HVX_Vector i8 = Q6_Vb_vsplat_R(8); + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + const HVX_Vector * restrict v_act = (const HVX_Vector *) (y_q + kt * 1152); + + HVX_Vector v_sum = accum_4bit_32x1(vptr, v_act, i8); + HVX_Vector v_sum_sf = Q6_Vsf_equals_Vw(v_sum); + + HVX_Vector v_scale_w = vptr[4]; + HVX_Vector v_scale_a = v_act[8]; + HVX_Vector v_scale_comb = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a); + HVX_Vector v_sum_scaled = hvx_vec_mul_f32_f32(v_sum_sf, v_scale_comb); + + v_sum_float = hvx_vec_add_f32_f32(v_sum_float, v_sum_scaled); + } + + hvx_vec_store_u(s, valid_rows * sizeof(float), v_sum_float); +} + +static void tiled_vec_dot_q4_0_32x2(const uint32_t n, float * restrict s0, float * restrict s1, const void * restrict vx, const void * restrict vy0, const void * restrict vy1, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y0_q = vy0; + const uint8_t * restrict y1_q = vy1; + + HVX_Vector v_sum_float_c0 = Q6_V_vzero(); + HVX_Vector v_sum_float_c1 = Q6_V_vzero(); + HVX_Vector i8 = Q6_Vb_vsplat_R(8); + + uint32_t n_k_tiles = n / 32; + uint32_t kt = 0; + for (; kt + 1 < n_k_tiles; kt += 2) { + const HVX_Vector * restrict vptr0 = (const HVX_Vector *) (tile_ptr + (kt + 0) * 640); + const HVX_Vector * restrict v_act0_0 = (const HVX_Vector *) (y0_q + (kt + 0) * 1152); + const HVX_Vector * restrict v_act1_0 = (const HVX_Vector *) (y1_q + (kt + 0) * 1152); + + const HVX_Vector * restrict vptr1 = (const HVX_Vector *) (tile_ptr + (kt + 1) * 640); + const HVX_Vector * restrict v_act0_1 = (const HVX_Vector *) (y0_q + (kt + 1) * 1152); + const HVX_Vector * restrict v_act1_1 = (const HVX_Vector *) (y1_q + (kt + 1) * 1152); + + HVX_VectorPair v_sums0 = accum_4bit_32x2(vptr0, v_act0_0, v_act1_0, i8); + HVX_VectorPair v_sums1 = accum_4bit_32x2(vptr1, v_act0_1, v_act1_1, i8); + + HVX_Vector v_sum_c0_0 = Q6_V_lo_W(v_sums0); + HVX_Vector v_sum_c1_0 = Q6_V_hi_W(v_sums0); + HVX_Vector v_sum_c0_1 = Q6_V_lo_W(v_sums1); + HVX_Vector v_sum_c1_1 = Q6_V_hi_W(v_sums1); + + HVX_Vector v_sum_sf_c0_0 = Q6_Vsf_equals_Vw(v_sum_c0_0); + HVX_Vector v_sum_sf_c1_0 = Q6_Vsf_equals_Vw(v_sum_c1_0); + HVX_Vector v_sum_sf_c0_1 = Q6_Vsf_equals_Vw(v_sum_c0_1); + HVX_Vector v_sum_sf_c1_1 = Q6_Vsf_equals_Vw(v_sum_c1_1); + + HVX_Vector v_scale_w0 = vptr0[4]; + HVX_Vector v_scale_w1 = vptr1[4]; + HVX_Vector v_scale_a_c0_0 = v_act0_0[8]; + HVX_Vector v_scale_a_c1_0 = v_act1_0[8]; + HVX_Vector v_scale_a_c0_1 = v_act0_1[8]; + HVX_Vector v_scale_a_c1_1 = v_act1_1[8]; + + HVX_Vector v_scale_comb_c0_0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w0, v_scale_a_c0_0); + HVX_Vector v_scale_comb_c1_0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w0, v_scale_a_c1_0); + HVX_Vector v_scale_comb_c0_1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w1, v_scale_a_c0_1); + HVX_Vector v_scale_comb_c1_1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w1, v_scale_a_c1_1); + + HVX_Vector v_sum_scaled_c0_0 = hvx_vec_mul_f32_f32(v_sum_sf_c0_0, v_scale_comb_c0_0); + HVX_Vector v_sum_scaled_c1_0 = hvx_vec_mul_f32_f32(v_sum_sf_c1_0, v_scale_comb_c1_0); + HVX_Vector v_sum_scaled_c0_1 = hvx_vec_mul_f32_f32(v_sum_sf_c0_1, v_scale_comb_c0_1); + HVX_Vector v_sum_scaled_c1_1 = hvx_vec_mul_f32_f32(v_sum_sf_c1_1, v_scale_comb_c1_1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, hvx_vec_add_f32_f32(v_sum_scaled_c0_0, v_sum_scaled_c0_1)); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, hvx_vec_add_f32_f32(v_sum_scaled_c1_0, v_sum_scaled_c1_1)); + } + + for (; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + const HVX_Vector * restrict v_act0 = (const HVX_Vector *) (y0_q + kt * 1152); + const HVX_Vector * restrict v_act1 = (const HVX_Vector *) (y1_q + kt * 1152); + + HVX_VectorPair v_sums = accum_4bit_32x2(vptr, v_act0, v_act1, i8); + HVX_Vector v_sum_c0 = Q6_V_lo_W(v_sums); + HVX_Vector v_sum_c1 = Q6_V_hi_W(v_sums); + + HVX_Vector v_sum_sf_c0 = Q6_Vsf_equals_Vw(v_sum_c0); + HVX_Vector v_sum_sf_c1 = Q6_Vsf_equals_Vw(v_sum_c1); + + HVX_Vector v_scale_w = vptr[4]; + HVX_Vector v_scale_a_c0 = v_act0[8]; + HVX_Vector v_scale_a_c1 = v_act1[8]; + + HVX_Vector v_scale_comb_c0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a_c0); + HVX_Vector v_scale_comb_c1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a_c1); + + HVX_Vector v_sum_scaled_c0 = hvx_vec_mul_f32_f32(v_sum_sf_c0, v_scale_comb_c0); + HVX_Vector v_sum_scaled_c1 = hvx_vec_mul_f32_f32(v_sum_sf_c1, v_scale_comb_c1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, v_sum_scaled_c0); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, v_sum_scaled_c1); + } + + hvx_vec_store_u(s0, valid_rows * sizeof(float), v_sum_float_c0); + hvx_vec_store_u(s1, valid_rows * sizeof(float), v_sum_float_c1); +} + +static void tiled_vec_dot_q4_1_32x1(const uint32_t n, float * restrict s, const void * restrict vx, const void * restrict vy, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y_q = vy; + + HVX_Vector v_sum_float = Q6_V_vzero(); + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + const HVX_Vector * restrict v_act = (const HVX_Vector *) (y_q + kt * 1280); + + HVX_Vector v_sum = accum_4bit_32x1(vptr, v_act, Q6_V_vzero()); + HVX_Vector v_sum_sf = Q6_Vsf_equals_Vw(v_sum); + + HVX_Vector v_scale_offset = vptr[4]; + HVX_VectorPair p_deal = Q6_W_vdeal_VVR(v_scale_offset, v_scale_offset, -2); + HVX_Vector v_scale = Q6_V_lo_W(p_deal); + HVX_Vector v_offset = Q6_V_hi_W(p_deal); + + HVX_Vector v_scale_a = v_act[8]; + HVX_Vector v_sum_a = v_act[9]; + + HVX_Vector v_scale_comb = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale, v_scale_a); + HVX_Vector v_offset_comb = hvx_vec_mul_f16_f16_to_f32_lower32(v_offset, v_sum_a); + + HVX_Vector v_scaled_dot = hvx_vec_mul_f32_f32(v_sum_sf, v_scale_comb); + HVX_Vector v_sum_scaled = hvx_vec_add_f32_f32(v_scaled_dot, v_offset_comb); + + v_sum_float = hvx_vec_add_f32_f32(v_sum_float, v_sum_scaled); + } + + hvx_vec_store_u(s, valid_rows * sizeof(float), v_sum_float); +} + +static void tiled_vec_dot_q4_1_32x2(const uint32_t n, float * restrict s0, float * restrict s1, const void * restrict vx, const void * restrict vy0, const void * restrict vy1, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y0_q = vy0; + const uint8_t * restrict y1_q = vy1; + + HVX_Vector v_sum_float_c0 = Q6_V_vzero(); + HVX_Vector v_sum_float_c1 = Q6_V_vzero(); + + uint32_t n_k_tiles = n / 32; + uint32_t kt = 0; + for (; kt + 1 < n_k_tiles; kt += 2) { + const HVX_Vector * restrict vptr0 = (const HVX_Vector *) (tile_ptr + (kt + 0) * 640); + const HVX_Vector * restrict v_act0_0 = (const HVX_Vector *) (y0_q + (kt + 0) * 1280); + const HVX_Vector * restrict v_act1_0 = (const HVX_Vector *) (y1_q + (kt + 0) * 1280); + + const HVX_Vector * restrict vptr1 = (const HVX_Vector *) (tile_ptr + (kt + 1) * 640); + const HVX_Vector * restrict v_act0_1 = (const HVX_Vector *) (y0_q + (kt + 1) * 1280); + const HVX_Vector * restrict v_act1_1 = (const HVX_Vector *) (y1_q + (kt + 1) * 1280); + + HVX_VectorPair v_sums0 = accum_4bit_32x2(vptr0, v_act0_0, v_act1_0, Q6_V_vzero()); + HVX_VectorPair v_sums1 = accum_4bit_32x2(vptr1, v_act0_1, v_act1_1, Q6_V_vzero()); + + HVX_Vector v_sum_c0_0 = Q6_V_lo_W(v_sums0); + HVX_Vector v_sum_c1_0 = Q6_V_hi_W(v_sums0); + HVX_Vector v_sum_c0_1 = Q6_V_lo_W(v_sums1); + HVX_Vector v_sum_c1_1 = Q6_V_hi_W(v_sums1); + + HVX_Vector v_sum_sf_c0_0 = Q6_Vsf_equals_Vw(v_sum_c0_0); + HVX_Vector v_sum_sf_c1_0 = Q6_Vsf_equals_Vw(v_sum_c1_0); + HVX_Vector v_sum_sf_c0_1 = Q6_Vsf_equals_Vw(v_sum_c0_1); + HVX_Vector v_sum_sf_c1_1 = Q6_Vsf_equals_Vw(v_sum_c1_1); + + HVX_Vector v_scale_offset0 = vptr0[4]; + HVX_VectorPair p_deal0 = Q6_W_vdeal_VVR(v_scale_offset0, v_scale_offset0, -2); + HVX_Vector v_scale0 = Q6_V_lo_W(p_deal0); + HVX_Vector v_offset0 = Q6_V_hi_W(p_deal0); + + HVX_Vector v_scale_offset1 = vptr1[4]; + HVX_VectorPair p_deal1 = Q6_W_vdeal_VVR(v_scale_offset1, v_scale_offset1, -2); + HVX_Vector v_scale1 = Q6_V_lo_W(p_deal1); + HVX_Vector v_offset1 = Q6_V_hi_W(p_deal1); + + HVX_Vector v_scale_a_c0_0 = v_act0_0[8]; + HVX_Vector v_sum_a_c0_0 = v_act0_0[9]; + HVX_Vector v_scale_a_c1_0 = v_act1_0[8]; + HVX_Vector v_sum_a_c1_0 = v_act1_0[9]; + + HVX_Vector v_scale_a_c0_1 = v_act0_1[8]; + HVX_Vector v_sum_a_c0_1 = v_act0_1[9]; + HVX_Vector v_scale_a_c1_1 = v_act1_1[8]; + HVX_Vector v_sum_a_c1_1 = v_act1_1[9]; + + HVX_Vector v_scale_comb_c0_0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale0, v_scale_a_c0_0); + HVX_Vector v_offset_comb_c0_0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_offset0, v_sum_a_c0_0); + HVX_Vector v_scale_comb_c1_0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale0, v_scale_a_c1_0); + HVX_Vector v_offset_comb_c1_0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_offset0, v_sum_a_c1_0); + + HVX_Vector v_scale_comb_c0_1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale1, v_scale_a_c0_1); + HVX_Vector v_offset_comb_c0_1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_offset1, v_sum_a_c0_1); + HVX_Vector v_scale_comb_c1_1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale1, v_scale_a_c1_1); + HVX_Vector v_offset_comb_c1_1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_offset1, v_sum_a_c1_1); + + HVX_Vector v_scaled_dot_c0_0 = hvx_vec_mul_f32_f32(v_sum_sf_c0_0, v_scale_comb_c0_0); + HVX_Vector v_sum_scaled_c0_0 = hvx_vec_add_f32_f32(v_scaled_dot_c0_0, v_offset_comb_c0_0); + + HVX_Vector v_scaled_dot_c1_0 = hvx_vec_mul_f32_f32(v_sum_sf_c1_0, v_scale_comb_c1_0); + HVX_Vector v_sum_scaled_c1_0 = hvx_vec_add_f32_f32(v_scaled_dot_c1_0, v_offset_comb_c1_0); + + HVX_Vector v_scaled_dot_c0_1 = hvx_vec_mul_f32_f32(v_sum_sf_c0_1, v_scale_comb_c0_1); + HVX_Vector v_sum_scaled_c0_1 = hvx_vec_add_f32_f32(v_scaled_dot_c0_1, v_offset_comb_c0_1); + + HVX_Vector v_scaled_dot_c1_1 = hvx_vec_mul_f32_f32(v_sum_sf_c1_1, v_scale_comb_c1_1); + HVX_Vector v_sum_scaled_c1_1 = hvx_vec_add_f32_f32(v_scaled_dot_c1_1, v_offset_comb_c1_1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, hvx_vec_add_f32_f32(v_sum_scaled_c0_0, v_sum_scaled_c0_1)); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, hvx_vec_add_f32_f32(v_sum_scaled_c1_0, v_sum_scaled_c1_1)); + } + + for (; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + const HVX_Vector * restrict v_act0 = (const HVX_Vector *) (y0_q + kt * 1280); + const HVX_Vector * restrict v_act1 = (const HVX_Vector *) (y1_q + kt * 1280); + + HVX_VectorPair v_sums = accum_4bit_32x2(vptr, v_act0, v_act1, Q6_V_vzero()); + HVX_Vector v_sum_c0 = Q6_V_lo_W(v_sums); + HVX_Vector v_sum_c1 = Q6_V_hi_W(v_sums); + + HVX_Vector v_sum_sf_c0 = Q6_Vsf_equals_Vw(v_sum_c0); + HVX_Vector v_sum_sf_c1 = Q6_Vsf_equals_Vw(v_sum_c1); + + HVX_Vector v_scale_offset = vptr[4]; + HVX_VectorPair p_deal = Q6_W_vdeal_VVR(v_scale_offset, v_scale_offset, -2); + HVX_Vector v_scale = Q6_V_lo_W(p_deal); + HVX_Vector v_offset = Q6_V_hi_W(p_deal); + + HVX_Vector v_scale_a_c0 = v_act0[8]; + HVX_Vector v_sum_a_c0 = v_act0[9]; + HVX_Vector v_scale_a_c1 = v_act1[8]; + HVX_Vector v_sum_a_c1 = v_act1[9]; + + HVX_Vector v_scale_comb_c0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale, v_scale_a_c0); + HVX_Vector v_offset_comb_c0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_offset, v_sum_a_c0); + HVX_Vector v_scale_comb_c1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale, v_scale_a_c1); + HVX_Vector v_offset_comb_c1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_offset, v_sum_a_c1); + + HVX_Vector v_scaled_dot_c0 = hvx_vec_mul_f32_f32(v_sum_sf_c0, v_scale_comb_c0); + HVX_Vector v_sum_scaled_c0 = hvx_vec_add_f32_f32(v_scaled_dot_c0, v_offset_comb_c0); + + HVX_Vector v_scaled_dot_c1 = hvx_vec_mul_f32_f32(v_sum_sf_c1, v_scale_comb_c1); + HVX_Vector v_sum_scaled_c1 = hvx_vec_add_f32_f32(v_scaled_dot_c1, v_offset_comb_c1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, v_sum_scaled_c0); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, v_sum_scaled_c1); + } + + hvx_vec_store_u(s0, valid_rows * sizeof(float), v_sum_float_c0); + hvx_vec_store_u(s1, valid_rows * sizeof(float), v_sum_float_c1); +} + +static void tiled_vec_dot_q8_0_32x1(const uint32_t n, float * restrict s, const void * restrict vx, const void * restrict vy, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y_q = vy; + + HVX_Vector v_sum_float = Q6_V_vzero(); + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 1152); + const HVX_Vector * restrict v_act = (const HVX_Vector *) (y_q + kt * 1152); + + HVX_Vector v_sum = accum_q8_0_32x1(vptr, v_act); + HVX_Vector v_sum_sf = Q6_Vsf_equals_Vw(v_sum); + + HVX_Vector v_scale_w = vptr[8]; + HVX_Vector v_scale_a = v_act[8]; + HVX_Vector v_scale_comb = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a); + HVX_Vector v_sum_scaled = hvx_vec_mul_f32_f32(v_sum_sf, v_scale_comb); + + v_sum_float = hvx_vec_add_f32_f32(v_sum_float, v_sum_scaled); + } + + hvx_vec_store_u(s, valid_rows * sizeof(float), v_sum_float); +} + +static void tiled_vec_dot_q8_0_32x2(const uint32_t n, float * restrict s0, float * restrict s1, const void * restrict vx, const void * restrict vy0, const void * restrict vy1, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y0_q = vy0; + const uint8_t * restrict y1_q = vy1; + + HVX_Vector v_sum_float_c0 = Q6_V_vzero(); + HVX_Vector v_sum_float_c1 = Q6_V_vzero(); + + uint32_t n_k_tiles = n / 32; + uint32_t kt = 0; + for (; kt + 1 < n_k_tiles; kt += 2) { + const HVX_Vector * restrict vptr0 = (const HVX_Vector *) (tile_ptr + (kt + 0) * 1152); + const HVX_Vector * restrict v_act0_0 = (const HVX_Vector *) (y0_q + (kt + 0) * 1152); + const HVX_Vector * restrict v_act1_0 = (const HVX_Vector *) (y1_q + (kt + 0) * 1152); + + const HVX_Vector * restrict vptr1 = (const HVX_Vector *) (tile_ptr + (kt + 1) * 1152); + const HVX_Vector * restrict v_act0_1 = (const HVX_Vector *) (y0_q + (kt + 1) * 1152); + const HVX_Vector * restrict v_act1_1 = (const HVX_Vector *) (y1_q + (kt + 1) * 1152); + + HVX_VectorPair v_sums0 = accum_q8_0_32x2(vptr0, v_act0_0, v_act1_0); + HVX_VectorPair v_sums1 = accum_q8_0_32x2(vptr1, v_act0_1, v_act1_1); + + HVX_Vector v_sum_c0_0 = Q6_V_lo_W(v_sums0); + HVX_Vector v_sum_c1_0 = Q6_V_hi_W(v_sums0); + HVX_Vector v_sum_c0_1 = Q6_V_lo_W(v_sums1); + HVX_Vector v_sum_c1_1 = Q6_V_hi_W(v_sums1); + + HVX_Vector v_sum_sf_c0_0 = Q6_Vsf_equals_Vw(v_sum_c0_0); + HVX_Vector v_sum_sf_c1_0 = Q6_Vsf_equals_Vw(v_sum_c1_0); + HVX_Vector v_sum_sf_c0_1 = Q6_Vsf_equals_Vw(v_sum_c0_1); + HVX_Vector v_sum_sf_c1_1 = Q6_Vsf_equals_Vw(v_sum_c1_1); + + HVX_Vector v_scale_w0 = vptr0[8]; + HVX_Vector v_scale_w1 = vptr1[8]; + HVX_Vector v_scale_a_c0_0 = v_act0_0[8]; + HVX_Vector v_scale_a_c1_0 = v_act1_0[8]; + HVX_Vector v_scale_a_c0_1 = v_act0_1[8]; + HVX_Vector v_scale_a_c1_1 = v_act1_1[8]; + + HVX_Vector v_scale_comb_c0_0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w0, v_scale_a_c0_0); + HVX_Vector v_scale_comb_c1_0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w0, v_scale_a_c1_0); + HVX_Vector v_scale_comb_c0_1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w1, v_scale_a_c0_1); + HVX_Vector v_scale_comb_c1_1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w1, v_scale_a_c1_1); + + HVX_Vector v_sum_scaled_c0_0 = hvx_vec_mul_f32_f32(v_sum_sf_c0_0, v_scale_comb_c0_0); + HVX_Vector v_sum_scaled_c1_0 = hvx_vec_mul_f32_f32(v_sum_sf_c1_0, v_scale_comb_c1_0); + HVX_Vector v_sum_scaled_c0_1 = hvx_vec_mul_f32_f32(v_sum_sf_c0_1, v_scale_comb_c0_1); + HVX_Vector v_sum_scaled_c1_1 = hvx_vec_mul_f32_f32(v_sum_sf_c1_1, v_scale_comb_c1_1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, hvx_vec_add_f32_f32(v_sum_scaled_c0_0, v_sum_scaled_c0_1)); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, hvx_vec_add_f32_f32(v_sum_scaled_c1_0, v_sum_scaled_c1_1)); + } + + for (; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 1152); + const HVX_Vector * restrict v_act0 = (const HVX_Vector *) (y0_q + kt * 1152); + const HVX_Vector * restrict v_act1 = (const HVX_Vector *) (y1_q + kt * 1152); + + HVX_VectorPair v_sums = accum_q8_0_32x2(vptr, v_act0, v_act1); + HVX_Vector v_sum_c0 = Q6_V_lo_W(v_sums); + HVX_Vector v_sum_c1 = Q6_V_hi_W(v_sums); + + HVX_Vector v_sum_sf_c0 = Q6_Vsf_equals_Vw(v_sum_c0); + HVX_Vector v_sum_sf_c1 = Q6_Vsf_equals_Vw(v_sum_c1); + + HVX_Vector v_scale_w = vptr[8]; + HVX_Vector v_scale_a_c0 = v_act0[8]; + HVX_Vector v_scale_a_c1 = v_act1[8]; + + HVX_Vector v_scale_comb_c0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a_c0); + HVX_Vector v_scale_comb_c1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a_c1); + + HVX_Vector v_sum_scaled_c0 = hvx_vec_mul_f32_f32(v_sum_sf_c0, v_scale_comb_c0); + HVX_Vector v_sum_scaled_c1 = hvx_vec_mul_f32_f32(v_sum_sf_c1, v_scale_comb_c1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, v_sum_scaled_c0); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, v_sum_scaled_c1); + } + + hvx_vec_store_u(s0, valid_rows * sizeof(float), v_sum_float_c0); + hvx_vec_store_u(s1, valid_rows * sizeof(float), v_sum_float_c1); +} + +static void tiled_vec_dot_iq4nl_32x1(const uint32_t n, float * restrict s, const void * restrict vx, const void * restrict vy, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y_q = vy; + + HVX_Vector v_sum_float = Q6_V_vzero(); + HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + HVX_Vector lut = *(const HVX_Vector *) kvalues_iq4nl_lut; + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + const HVX_Vector * restrict v_act = (const HVX_Vector *) (y_q + kt * 1152); + + HVX_Vector v_sum = accum_4bit_32x1_lut(vptr, v_act, mask_h4, lut); + HVX_Vector v_sum_sf = Q6_Vsf_equals_Vw(v_sum); + + HVX_Vector v_scale_w = vptr[4]; + HVX_Vector v_scale_a = v_act[8]; + HVX_Vector v_scale_comb = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a); + HVX_Vector v_sum_scaled = hvx_vec_mul_f32_f32(v_sum_sf, v_scale_comb); + + v_sum_float = hvx_vec_add_f32_f32(v_sum_float, v_sum_scaled); + } + + hvx_vec_store_u(s, valid_rows * sizeof(float), v_sum_float); +} + +static void tiled_vec_dot_iq4nl_32x2(const uint32_t n, float * restrict s0, float * restrict s1, const void * restrict vx, const void * restrict vy0, const void * restrict vy1, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y0_q = vy0; + const uint8_t * restrict y1_q = vy1; + + HVX_Vector v_sum_float_c0 = Q6_V_vzero(); + HVX_Vector v_sum_float_c1 = Q6_V_vzero(); + HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + HVX_Vector lut = *(const HVX_Vector *) kvalues_iq4nl_lut; + + uint32_t n_k_tiles = n / 32; + uint32_t kt = 0; + for (; kt + 1 < n_k_tiles; kt += 2) { + const HVX_Vector * restrict vptr0 = (const HVX_Vector *) (tile_ptr + (kt + 0) * 640); + const HVX_Vector * restrict v_act0_0 = (const HVX_Vector *) (y0_q + (kt + 0) * 1152); + const HVX_Vector * restrict v_act1_0 = (const HVX_Vector *) (y1_q + (kt + 0) * 1152); + + const HVX_Vector * restrict vptr1 = (const HVX_Vector *) (tile_ptr + (kt + 1) * 640); + const HVX_Vector * restrict v_act0_1 = (const HVX_Vector *) (y0_q + (kt + 1) * 1152); + const HVX_Vector * restrict v_act1_1 = (const HVX_Vector *) (y1_q + (kt + 1) * 1152); + + HVX_VectorPair v_sums0 = accum_4bit_32x2_lut(vptr0, v_act0_0, v_act1_0, mask_h4, lut); + HVX_VectorPair v_sums1 = accum_4bit_32x2_lut(vptr1, v_act0_1, v_act1_1, mask_h4, lut); + + HVX_Vector v_sum_c0_0 = Q6_V_lo_W(v_sums0); + HVX_Vector v_sum_c1_0 = Q6_V_hi_W(v_sums0); + HVX_Vector v_sum_c0_1 = Q6_V_lo_W(v_sums1); + HVX_Vector v_sum_c1_1 = Q6_V_hi_W(v_sums1); + + HVX_Vector v_sum_sf_c0_0 = Q6_Vsf_equals_Vw(v_sum_c0_0); + HVX_Vector v_sum_sf_c1_0 = Q6_Vsf_equals_Vw(v_sum_c1_0); + HVX_Vector v_sum_sf_c0_1 = Q6_Vsf_equals_Vw(v_sum_c0_1); + HVX_Vector v_sum_sf_c1_1 = Q6_Vsf_equals_Vw(v_sum_c1_1); + + HVX_Vector v_scale_w0 = vptr0[4]; + HVX_Vector v_scale_w1 = vptr1[4]; + HVX_Vector v_scale_a_c0_0 = v_act0_0[8]; + HVX_Vector v_scale_a_c1_0 = v_act1_0[8]; + HVX_Vector v_scale_a_c0_1 = v_act0_1[8]; + HVX_Vector v_scale_a_c1_1 = v_act1_1[8]; + + HVX_Vector v_scale_comb_c0_0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w0, v_scale_a_c0_0); + HVX_Vector v_scale_comb_c1_0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w0, v_scale_a_c1_0); + HVX_Vector v_scale_comb_c0_1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w1, v_scale_a_c0_1); + HVX_Vector v_scale_comb_c1_1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w1, v_scale_a_c1_1); + + HVX_Vector v_sum_scaled_c0_0 = hvx_vec_mul_f32_f32(v_sum_sf_c0_0, v_scale_comb_c0_0); + HVX_Vector v_sum_scaled_c1_0 = hvx_vec_mul_f32_f32(v_sum_sf_c1_0, v_scale_comb_c1_0); + HVX_Vector v_sum_scaled_c0_1 = hvx_vec_mul_f32_f32(v_sum_sf_c0_1, v_scale_comb_c0_1); + HVX_Vector v_sum_scaled_c1_1 = hvx_vec_mul_f32_f32(v_sum_sf_c1_1, v_scale_comb_c1_1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, hvx_vec_add_f32_f32(v_sum_scaled_c0_0, v_sum_scaled_c0_1)); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, hvx_vec_add_f32_f32(v_sum_scaled_c1_0, v_sum_scaled_c1_1)); + } + + for (; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + const HVX_Vector * restrict v_act0 = (const HVX_Vector *) (y0_q + kt * 1152); + const HVX_Vector * restrict v_act1 = (const HVX_Vector *) (y1_q + kt * 1152); + + HVX_VectorPair v_sums = accum_4bit_32x2_lut(vptr, v_act0, v_act1, mask_h4, lut); + HVX_Vector v_sum_c0 = Q6_V_lo_W(v_sums); + HVX_Vector v_sum_c1 = Q6_V_hi_W(v_sums); + + HVX_Vector v_sum_sf_c0 = Q6_Vsf_equals_Vw(v_sum_c0); + HVX_Vector v_sum_sf_c1 = Q6_Vsf_equals_Vw(v_sum_c1); + + HVX_Vector v_scale_w = vptr[4]; + HVX_Vector v_scale_a_c0 = v_act0[8]; + HVX_Vector v_scale_a_c1 = v_act1[8]; + + HVX_Vector v_scale_comb_c0 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a_c0); + HVX_Vector v_scale_comb_c1 = hvx_vec_mul_f16_f16_to_f32_lower32(v_scale_w, v_scale_a_c1); + + HVX_Vector v_sum_scaled_c0 = hvx_vec_mul_f32_f32(v_sum_sf_c0, v_scale_comb_c0); + HVX_Vector v_sum_scaled_c1 = hvx_vec_mul_f32_f32(v_sum_sf_c1, v_scale_comb_c1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, v_sum_scaled_c0); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, v_sum_scaled_c1); + } + + hvx_vec_store_u(s0, valid_rows * sizeof(float), v_sum_float_c0); + hvx_vec_store_u(s1, valid_rows * sizeof(float), v_sum_float_c1); +} + +static void tiled_vec_dot_mxfp4_32x1(const uint32_t n, float * restrict s, const void * restrict vx, const void * restrict vy, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y_q = vy; + + HVX_Vector v_sum_float = Q6_V_vzero(); + HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut; + HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; + HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); + + uint32_t n_k_tiles = n / 32; + for (uint32_t kt = 0; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + const HVX_Vector * restrict v_act = (const HVX_Vector *) (y_q + kt * 1152); + + HVX_Vector v_sum = accum_4bit_32x1_lut(vptr, v_act, mask_h4, lut); + HVX_Vector v_sum_sf = Q6_Vsf_equals_Vw(v_sum); + + HVX_Vector v_scale_w = hvx_vmem(tile_ptr + kt * 640 + 512); + HVX_Vector r0_d = Q6_V_vdelta_VV(v_scale_w, expand); + r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); + HVX_Vector v_scale_w_f32 = Q6_Vw_vasl_VwR(r0_d, 23); + + HVX_Vector v_scale_a_f16 = v_act[8]; + HVX_VectorPair p_scale_a_f32 = hvx_vec_f16_to_f32_shuff(v_scale_a_f16); + HVX_Vector v_scale_a = Q6_V_lo_W(p_scale_a_f32); + + HVX_Vector v_scale_comb = hvx_vec_mul_f32_f32(v_scale_w_f32, v_scale_a); + HVX_Vector v_sum_scaled = hvx_vec_mul_f32_f32(v_sum_sf, v_scale_comb); + + v_sum_float = hvx_vec_add_f32_f32(v_sum_float, v_sum_scaled); + } + + v_sum_float = hvx_vec_mul_f32_f32(v_sum_float, hvx_vec_splat_f32(0.5f)); + + hvx_vec_store_u(s, valid_rows * sizeof(float), v_sum_float); +} + +static void tiled_vec_dot_mxfp4_32x2(const uint32_t n, float * restrict s0, float * restrict s1, const void * restrict vx, const void * restrict vy0, const void * restrict vy1, uint32_t valid_rows) { + const uint8_t * restrict tile_ptr = vx; + const uint8_t * restrict y0_q = vy0; + const uint8_t * restrict y1_q = vy1; + + HVX_Vector v_sum_float_c0 = Q6_V_vzero(); + HVX_Vector v_sum_float_c1 = Q6_V_vzero(); + HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); + HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut; + HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; + HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); + + uint32_t n_k_tiles = n / 32; + uint32_t kt = 0; + for (; kt + 1 < n_k_tiles; kt += 2) { + const HVX_Vector * restrict vptr0 = (const HVX_Vector *) (tile_ptr + (kt + 0) * 640); + const HVX_Vector * restrict v_act0_0 = (const HVX_Vector *) (y0_q + (kt + 0) * 1152); + const HVX_Vector * restrict v_act1_0 = (const HVX_Vector *) (y1_q + (kt + 0) * 1152); + + const HVX_Vector * restrict vptr1 = (const HVX_Vector *) (tile_ptr + (kt + 1) * 640); + const HVX_Vector * restrict v_act0_1 = (const HVX_Vector *) (y0_q + (kt + 1) * 1152); + const HVX_Vector * restrict v_act1_1 = (const HVX_Vector *) (y1_q + (kt + 1) * 1152); + + HVX_VectorPair v_sums0 = accum_4bit_32x2_lut(vptr0, v_act0_0, v_act1_0, mask_h4, lut); + HVX_VectorPair v_sums1 = accum_4bit_32x2_lut(vptr1, v_act0_1, v_act1_1, mask_h4, lut); + + HVX_Vector v_sum_c0_0 = Q6_V_lo_W(v_sums0); + HVX_Vector v_sum_c1_0 = Q6_V_hi_W(v_sums0); + HVX_Vector v_sum_c0_1 = Q6_V_lo_W(v_sums1); + HVX_Vector v_sum_c1_1 = Q6_V_hi_W(v_sums1); + + HVX_Vector v_sum_sf_c0_0 = Q6_Vsf_equals_Vw(v_sum_c0_0); + HVX_Vector v_sum_sf_c1_0 = Q6_Vsf_equals_Vw(v_sum_c1_0); + HVX_Vector v_sum_sf_c0_1 = Q6_Vsf_equals_Vw(v_sum_c0_1); + HVX_Vector v_sum_sf_c1_1 = Q6_Vsf_equals_Vw(v_sum_c1_1); + + HVX_Vector v_scale_w0 = hvx_vmem(tile_ptr + (kt + 0) * 640 + 512); + HVX_Vector r0_d0 = Q6_V_vdelta_VV(v_scale_w0, expand); + r0_d0 = Q6_V_vand_VV(r0_d0, e8m0_mask); + HVX_Vector v_scale_w_f32_0 = Q6_Vw_vasl_VwR(r0_d0, 23); + + HVX_Vector v_scale_w1 = hvx_vmem(tile_ptr + (kt + 1) * 640 + 512); + HVX_Vector r0_d1 = Q6_V_vdelta_VV(v_scale_w1, expand); + r0_d1 = Q6_V_vand_VV(r0_d1, e8m0_mask); + HVX_Vector v_scale_w_f32_1 = Q6_Vw_vasl_VwR(r0_d1, 23); + + HVX_Vector v_scale_a_c0_f16_0 = v_act0_0[8]; + HVX_Vector v_scale_a_c1_f16_0 = v_act1_0[8]; + HVX_Vector v_scale_a_c0_f16_1 = v_act0_1[8]; + HVX_Vector v_scale_a_c1_f16_1 = v_act1_1[8]; + + HVX_VectorPair p_scale_a_c0_f32_0 = hvx_vec_f16_to_f32_shuff(v_scale_a_c0_f16_0); + HVX_VectorPair p_scale_a_c1_f32_0 = hvx_vec_f16_to_f32_shuff(v_scale_a_c1_f16_0); + HVX_VectorPair p_scale_a_c0_f32_1 = hvx_vec_f16_to_f32_shuff(v_scale_a_c0_f16_1); + HVX_VectorPair p_scale_a_c1_f32_1 = hvx_vec_f16_to_f32_shuff(v_scale_a_c1_f16_1); + + HVX_Vector v_scale_a_c0_0 = Q6_V_lo_W(p_scale_a_c0_f32_0); + HVX_Vector v_scale_a_c1_0 = Q6_V_lo_W(p_scale_a_c1_f32_0); + HVX_Vector v_scale_a_c0_1 = Q6_V_lo_W(p_scale_a_c0_f32_1); + HVX_Vector v_scale_a_c1_1 = Q6_V_lo_W(p_scale_a_c1_f32_1); + + HVX_Vector v_scale_comb_c0_0 = hvx_vec_mul_f32_f32(v_scale_w_f32_0, v_scale_a_c0_0); + HVX_Vector v_scale_comb_c1_0 = hvx_vec_mul_f32_f32(v_scale_w_f32_0, v_scale_a_c1_0); + HVX_Vector v_scale_comb_c0_1 = hvx_vec_mul_f32_f32(v_scale_w_f32_1, v_scale_a_c0_1); + HVX_Vector v_scale_comb_c1_1 = hvx_vec_mul_f32_f32(v_scale_w_f32_1, v_scale_a_c1_1); + + HVX_Vector v_sum_scaled_c0_0 = hvx_vec_mul_f32_f32(v_sum_sf_c0_0, v_scale_comb_c0_0); + HVX_Vector v_sum_scaled_c1_0 = hvx_vec_mul_f32_f32(v_sum_sf_c1_0, v_scale_comb_c1_0); + HVX_Vector v_sum_scaled_c0_1 = hvx_vec_mul_f32_f32(v_sum_sf_c0_1, v_scale_comb_c0_1); + HVX_Vector v_sum_scaled_c1_1 = hvx_vec_mul_f32_f32(v_sum_sf_c1_1, v_scale_comb_c1_1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, hvx_vec_add_f32_f32(v_sum_scaled_c0_0, v_sum_scaled_c0_1)); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, hvx_vec_add_f32_f32(v_sum_scaled_c1_0, v_sum_scaled_c1_1)); + } + + for (; kt < n_k_tiles; kt++) { + const HVX_Vector * restrict vptr = (const HVX_Vector *) (tile_ptr + kt * 640); + const HVX_Vector * restrict v_act0 = (const HVX_Vector *) (y0_q + kt * 1152); + const HVX_Vector * restrict v_act1 = (const HVX_Vector *) (y1_q + kt * 1152); + + HVX_VectorPair v_sums = accum_4bit_32x2_lut(vptr, v_act0, v_act1, mask_h4, lut); + HVX_Vector v_sum_c0 = Q6_V_lo_W(v_sums); + HVX_Vector v_sum_c1 = Q6_V_hi_W(v_sums); + + HVX_Vector v_sum_sf_c0 = Q6_Vsf_equals_Vw(v_sum_c0); + HVX_Vector v_sum_sf_c1 = Q6_Vsf_equals_Vw(v_sum_c1); + + HVX_Vector v_scale_w = hvx_vmem(tile_ptr + kt * 640 + 512); + HVX_Vector r0_d = Q6_V_vdelta_VV(v_scale_w, expand); + r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); + HVX_Vector v_scale_w_f32 = Q6_Vw_vasl_VwR(r0_d, 23); + + HVX_Vector v_scale_a_c0_f16 = v_act0[8]; + HVX_Vector v_scale_a_c1_f16 = v_act1[8]; + + HVX_VectorPair p_scale_a_c0_f32 = hvx_vec_f16_to_f32_shuff(v_scale_a_c0_f16); + HVX_VectorPair p_scale_a_c1_f32 = hvx_vec_f16_to_f32_shuff(v_scale_a_c1_f16); + + HVX_Vector v_scale_a_c0 = Q6_V_lo_W(p_scale_a_c0_f32); + HVX_Vector v_scale_a_c1 = Q6_V_lo_W(p_scale_a_c1_f32); + + HVX_Vector v_scale_comb_c0 = hvx_vec_mul_f32_f32(v_scale_w_f32, v_scale_a_c0); + HVX_Vector v_scale_comb_c1 = hvx_vec_mul_f32_f32(v_scale_w_f32, v_scale_a_c1); + + HVX_Vector v_sum_scaled_c0 = hvx_vec_mul_f32_f32(v_sum_sf_c0, v_scale_comb_c0); + HVX_Vector v_sum_scaled_c1 = hvx_vec_mul_f32_f32(v_sum_sf_c1, v_scale_comb_c1); + + v_sum_float_c0 = hvx_vec_add_f32_f32(v_sum_float_c0, v_sum_scaled_c0); + v_sum_float_c1 = hvx_vec_add_f32_f32(v_sum_float_c1, v_sum_scaled_c1); + } + + v_sum_float_c0 = hvx_vec_mul_f32_f32(v_sum_float_c0, hvx_vec_splat_f32(0.5f)); + v_sum_float_c1 = hvx_vec_mul_f32_f32(v_sum_float_c1, hvx_vec_splat_f32(0.5f)); + + hvx_vec_store_u(s0, valid_rows * sizeof(float), v_sum_float_c0); + hvx_vec_store_u(s1, valid_rows * sizeof(float), v_sum_float_c1); +} + +static inline void quantize_f32_q8_0_tiled_kernel( + const uint8_t * restrict src_data, + uint8_t * restrict dst_data, + uint8_t * restrict tmp_data, + uint32_t ne0, + uint32_t nrows, + size_t src_row_size, + size_t dst_row_size +) { + const size_t src_row_size_padded = hex_round_up(src_row_size, QK_Q8_0_TILED * sizeof(float)); + hvx_splat_f32_a(tmp_data, 0.0f, src_row_size_padded / sizeof(float)); + + for (uint32_t i = 0; i < nrows; ++i) { + hex_l2fetch(src_data, src_row_size, src_row_size, 2); + hvx_copy_f32_aa(tmp_data, src_data, ne0); + + quantize_row_f32_q8_0_tiled((float *) tmp_data, dst_data, ne0); + dst_data += dst_row_size; + src_data += src_row_size; + } +} + +static inline void quantize_f32_q8_1_tiled_kernel( + const uint8_t * restrict src_data, + uint8_t * restrict dst_data, + uint8_t * restrict tmp_data, + uint32_t ne0, + uint32_t nrows, + size_t src_row_size, + size_t dst_row_size +) { + const size_t src_row_size_padded = hex_round_up(src_row_size, QK_Q8_0_TILED * sizeof(float)); + hvx_splat_f32_a(tmp_data, 0.0f, src_row_size_padded / sizeof(float)); + + for (uint32_t i = 0; i < nrows; ++i) { + hex_l2fetch(src_data, src_row_size, src_row_size, 2); + hvx_copy_f32_aa(tmp_data, src_data, ne0); + + quantize_row_f32_q8_1_tiled((float *) tmp_data, dst_data, ne0); + dst_data += dst_row_size; + src_data += src_row_size; + } +} + +static inline void quantize_f32_q8_0_tiled_block_kernel( + const float * restrict src, + uint8_t * restrict dst, + uint8_t * restrict tmp_data, + uint32_t ne0, + uint32_t ib_first, + uint32_t ib_last, + size_t src_row_size, + size_t dst_row_size, + uint32_t r, + uint32_t c +) { + const uint32_t qk = QK_Q8_0_TILED; + const uint32_t nb = (ne0 + qk - 1) / qk; + + for (uint32_t ib = ib_first; ib < ib_last; ++ib) { + const uint8_t * restrict src_ptr = (const uint8_t *) src + r * src_row_size + c * qk * sizeof(float); + uint8_t * restrict dst_ptr = dst + r * dst_row_size + c * 4 * 1152; + + hex_l2fetch(src_ptr, qk * sizeof(float), qk * sizeof(float), 1); + + if (c == nb - 1) { + uint32_t active_elements = ne0 - c * qk; + hvx_splat_f32_a(tmp_data, 0.0f, qk); + hvx_copy_f32_aa(tmp_data, src_ptr, active_elements); + } else { + hvx_copy_f32_aa(tmp_data, src_ptr, qk); + } + + quantize_block_f32_q8_0_tiled((float *) tmp_data, dst_ptr); + + c++; + if (c == nb) { + c = 0; + r++; + } + } +} + +static inline void quantize_f32_q8_1_tiled_block_kernel( + const float * restrict src, + uint8_t * restrict dst, + uint8_t * restrict tmp_data, + uint32_t ne0, + uint32_t ib_first, + uint32_t ib_last, + size_t src_row_size, + size_t dst_row_size, + uint32_t r, + uint32_t c +) { + const uint32_t qk = QK_Q8_0_TILED; + const uint32_t nb = (ne0 + qk - 1) / qk; + + for (uint32_t ib = ib_first; ib < ib_last; ++ib) { + const uint8_t * restrict src_ptr = (const uint8_t *) src + r * src_row_size + c * qk * sizeof(float); + uint8_t * restrict dst_ptr = dst + r * dst_row_size + c * 4 * 1280; + + hex_l2fetch(src_ptr, qk * sizeof(float), qk * sizeof(float), 1); + + if (c == nb - 1) { + uint32_t active_elements = ne0 - c * qk; + hvx_splat_f32_a(tmp_data, 0.0f, qk); + hvx_copy_f32_aa(tmp_data, src_ptr, active_elements); + } else { + hvx_copy_f32_aa(tmp_data, src_ptr, qk); + } + + quantize_block_f32_q8_1_tiled((float *) tmp_data, dst_ptr); + + c++; + if (c == nb) { + c = 0; + r++; + } + } +} diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 3715227d2c7b..d76512ea4a35 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -361,7 +361,7 @@ static void vtcm_free(struct htp_context * ctx) { static void htp_packet_callback(dspqueue_t queue, int error, void * context); static void htp_error_callback(dspqueue_t queue, int error, void * context); -AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_queue_id, uint32 n_hvx, uint32 use_hmx, uint64_t max_vmem) { +AEEResult htp_iface_start(remote_handle64 handle, uint32_t sess_id, uint64_t dsp_queue_id, uint32_t n_hvx, uint32_t n_hmx, uint64_t max_vmem) { struct htp_context * ctx = (struct htp_context *) handle; if (!ctx) { @@ -395,18 +395,18 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que return AEE_ENOMEMORY; } -#ifdef HTP_HAS_HMX - ctx->hmx_enabled = use_hmx; + ctx->hmx_enabled = n_hmx; ctx->hmx_queue = NULL; - if (use_hmx) { + if (n_hmx) { ctx->hmx_queue = hmx_queue_create(16, ctx->vtcm_rctx); - if (!ctx->hmx_queue) { + if (ctx->hmx_queue) { + ctx->hmx_queue->trace = &ctx->trace[HTP_MAX_NTHREADS]; + } else { FARF(ERROR, "hmx-queue-create failed"); ctx->hmx_enabled = false; } } - FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx); -#endif + FARF(HIGH, "HMX %s (n_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", n_hmx); qurt_sysenv_max_hthreads_t hw_threads; qurt_sysenv_get_max_hw_threads(&hw_threads); @@ -425,6 +425,9 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que ctx->n_threads = n_hvx; for (int i = 0; i < ctx->n_threads; i++) { ctx->dma[i] = dma_queue_create(256); // queue depth + if (ctx->dma[i]) { + ctx->dma[i]->trace = &ctx->trace[i]; + } } ctx->ddr_spad_size = 512 * 1024; // 512 KB @@ -476,13 +479,11 @@ AEEResult htp_iface_stop(remote_handle64 handle) { dma_queue_delete(ctx->dma[i]); } -#ifdef HTP_HAS_HMX if (ctx->hmx_queue) { hmx_queue_delete(ctx->hmx_queue); ctx->hmx_queue = NULL; } ctx->hmx_enabled = false; -#endif vtcm_free(ctx); @@ -495,6 +496,36 @@ AEEResult htp_iface_stop(remote_handle64 handle) { return AEE_SUCCESS; } +AEEResult htp_iface_hwinfo(remote_handle64 handle, uint32_t * n_threads, uint32_t * n_hvx, uint32_t * n_hmx, uint64_t * vtcm_size) { + (void)handle; + if (!n_threads || !n_hvx || !n_hmx || !vtcm_size) { + return AEE_EBADPARM; + } + + qurt_sysenv_max_hthreads_t hw_threads; + qurt_sysenv_get_max_hw_threads(&hw_threads); + uint32_t hw_nhvx = (qurt_hvx_get_units() >> 8) & 0xFF; + + uint32_t n_hvx_val = hw_nhvx; + if (n_hvx_val > hw_threads.max_hthreads) { + n_hvx_val = hw_threads.max_hthreads; + } + if (n_hvx_val > HTP_MAX_NTHREADS) { + n_hvx_val = HTP_MAX_NTHREADS; + } + + // for now we force n_threads == n_hvx + *n_threads = n_hvx_val; + *n_hvx = n_hvx_val; + *n_hmx = 1; + + uint32_t vtcm_sz = 8 * 1024 * 1024; // 8MB default fallback + HAP_compute_res_query_VTCM(0, (unsigned int *)&vtcm_sz, NULL, NULL, NULL); + *vtcm_size = vtcm_sz; + + return AEE_SUCCESS; +} + static void htp_error_callback(dspqueue_t queue, int error, void * context) { // No errors expected on the DSP. FARF(ERROR, "Error callback: 0x%08x", (unsigned) error); @@ -502,7 +533,8 @@ static void htp_error_callback(dspqueue_t queue, int error, void * context) { struct profile_data { uint64_t usecs; - uint64_t cycles; + uint64_t cycles_start; + uint64_t cycles_stop; uint32_t pmu_counters[HEX_NUM_PMU_COUNTERS]; }; @@ -512,8 +544,9 @@ static inline void profile_start(uint32_t mode, struct profile_data * d) { hex_get_pmu(d->pmu_counters); // fallthrough case HTP_PROF_BASIC: + case HTP_PROF_TRACE: d->usecs = HAP_perf_get_qtimer_count(); - d->cycles = hex_get_cycles(); + d->cycles_start = hex_get_cycles(); break; default: break; @@ -530,8 +563,9 @@ static inline void profile_stop(uint32_t mode, struct profile_data * d) { } // fallthrough case HTP_PROF_BASIC: + case HTP_PROF_TRACE: d->usecs = HAP_perf_qtimer_count_to_us(HAP_perf_get_qtimer_count() - d->usecs); - d->cycles = hex_get_cycles() - d->cycles; + d->cycles_stop = hex_get_cycles(); break; default: break; @@ -546,6 +580,12 @@ static int execute_op(struct htp_ops_context * octx) { case HTP_OP_MUL_MAT_ID: return op_matmul_id(octx); + case HTP_OP_MUL_MAT_QKV: + return op_matmul_qkv(octx); + + case HTP_OP_MUL_MAT_FFN: + return op_matmul_ffn(octx); + case HTP_OP_MUL: case HTP_OP_ADD: case HTP_OP_SUB: @@ -754,8 +794,9 @@ static void prep_tensors(struct htp_context *ctx, struct htp_buf_desc *bufs, str } } -static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) { +static int proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, uint32_t idx, struct htp_op_desc * op) { memcpy(octx->op_params, op->params, sizeof(octx->op_params)); + memcpy(octx->kernel_params, op->kernel_params, sizeof(octx->kernel_params)); octx->flags = op->flags; octx->op = op->opcode; @@ -777,22 +818,41 @@ static void proc_op_req(struct htp_ops_context * octx, struct htp_tensor *tens, src->ne[0], src->ne[1], src->ne[3], src->ne[3]); } - // Prep output tensor - struct htp_tensor *dst = tens + op->dst; + // Prep output tensors + for (uint32_t i = 0; i < HTP_OP_MAX_OUTPUTS; i++) { + uint16_t dst_idx = op->dst[i]; + if (dst_idx == 0xffff) { + octx->dsts[i] = NULL; + continue; + } + struct htp_tensor *dst = tens + dst_idx; + octx->dsts[i] = dst; - octx->dst = dst; + FARF(HIGH, "prep-dst[%u] #%u: data %p size %u : %u:%u:%u:%u", i, dst_idx, (void*) dst->data, dst->size, + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + } - FARF(HIGH, "prep-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size, - dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]); + int status = execute_op(octx); - (void) execute_op(octx); + octx->src0_spad.src = NULL; + octx->src1_spad.src = NULL; + octx->src2_spad.src = NULL; + octx->src3_spad.src = NULL; + octx->dst_spad.src = NULL; // flush buffers on output - hex_l2flush((void *) dst->data, dst->size); - dst->flags |= HTP_TENSOR_FLUSHED; + for (uint32_t i = 0; i < HTP_OP_MAX_OUTPUTS; i++) { + if (octx->dsts[i]) { + struct htp_tensor *dst = (struct htp_tensor *)octx->dsts[i]; + hex_l2flush((void *) dst->data, dst->size); + dst->flags |= HTP_TENSOR_FLUSHED; + + FARF(HIGH, "post-dst[%u] #%u: data %p size %u : %u:%u:%u:%u", i, op->dst[i], (void*) dst->data, dst->size, + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + } + } - FARF(HIGH, "post-dst #%u: data %p size %u : %u:%u:%u:%u", op->dst, (void*) dst->data, dst->size, - dst->ne[0], dst->ne[1], dst->ne[3], dst->ne[3]); + return status; } #define DSPQUEUE_POLL_TIMEOUT_USEC 100 @@ -845,14 +905,15 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { const uint32_t t_size = sizeof(struct htp_tensor) * n_tens; const uint32_t o_size = sizeof(struct htp_op_desc) * n_ops; const uint32_t p_size = sizeof(struct htp_prof_desc) * n_ops; + const uint32_t tr_size = (HTP_MAX_NTHREADS + 1) * req.n_traces * sizeof(struct htp_trace_desc); - if (dbuf.size < b_size + t_size + o_size + p_size) { - FARF(ERROR, "invalid opbatch memory block size %u", dbuf.size); + if (dbuf.size < b_size + t_size + o_size + p_size + tr_size) { + FARF(ERROR, "invalid opbatch memory block size %u (req %u)", dbuf.size, b_size + t_size + o_size + p_size + tr_size); break; } - FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u : m-size %u b-size %u t-size %u o-size %u", req.id, - n_bufs, n_tens, n_ops, dbuf.size, b_size, t_size, o_size); + FARF(HIGH, "processing opbatch #%u: n-bufs %u n-tensors %u n-ops %u n-traces %u : m-size %u b-size %u t-size %u o-size %u", req.id, + n_bufs, n_tens, n_ops, req.n_traces, dbuf.size, b_size, t_size, o_size); // Setup descriptor pointers uint8_t * m_ptr = dbuf.ptr; @@ -869,24 +930,45 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { octx->n_threads = ctx->n_threads; octx->ctx = ctx; + if (ctx->profiler == HTP_PROF_TRACE) { + memset(ctx->trace, 0, sizeof(ctx->trace)); + struct htp_trace_desc * trace_events = (struct htp_trace_desc *) (m_ptr + p_size); + for (int t = 0; t <= HTP_MAX_NTHREADS; t++) { + ctx->trace[t].events = &trace_events[t * req.n_traces]; + ctx->trace[t].max_events = req.n_traces; + } + } else { + for (int t = 0; t <= HTP_MAX_NTHREADS; t++) { + ctx->trace[t].events = NULL; + ctx->trace[t].max_events = 0; + } + } + + int op_status = HTP_STATUS_OK; + uint32_t op_wakeup = n_ops / 2; // half-way throgh the batch + for (uint32_t i=0; i < n_ops; i++) { struct profile_data prof; - if (i == (n_ops-1)) { - // wake up the host before starting the last op + if (i == op_wakeup) { dspqueue_write_early_wakeup_noblock(queue, 0, 0); } profile_start(ctx->profiler, &prof); - proc_op_req(octx, tens, i, &ops[i]); + op_status = proc_op_req(octx, tens, i, &ops[i]); profile_stop(ctx->profiler, &prof); + if (op_status != HTP_STATUS_OK) { + break; + } + if (ctx->profiler) { pds[i].opcode = ops[i].opcode; pds[i].usecs = prof.usecs; - pds[i].cycles = prof.cycles; + pds[i].cycles_start = prof.cycles_start; + pds[i].cycles_stop = prof.cycles_stop; for (int j = 0; j < HEX_NUM_PMU_COUNTERS; j++) { pds[i].pmu[j] = prof.pmu_counters[j]; } @@ -895,10 +977,18 @@ static void htp_packet_callback(dspqueue_t queue, int error, void * context) { struct htp_opbatch_rsp rsp; rsp.id = req.id; - rsp.status = HTP_STATUS_OK; + rsp.status = op_status; rsp.n_bufs = n_bufs; rsp.n_tensors = n_tens; rsp.n_ops = n_ops; + memset(rsp.pad, 0, sizeof(rsp.pad)); + if (ctx->profiler == HTP_PROF_TRACE) { + for (int t = 0; t <= HTP_MAX_NTHREADS; t++) { + rsp.n_traces[t] = ctx->trace[t].count; + } + } else { + memset(rsp.n_traces, 0, sizeof(rsp.n_traces)); + } dbuf.flags = DSPQUEUE_BUFFER_FLAG_FLUSH_SENDER | DSPQUEUE_BUFFER_FLAG_INVALIDATE_RECIPIENT; diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 5121c6f9badd..81a0ffbebb8d 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -5,6 +5,7 @@ #include #include +#include #include #include @@ -17,2906 +18,120 @@ #include "ggml-common.h" #include "htp-ctx.h" #include "htp-ops.h" -#include "htp-ops.h" -#include "hmx-ops.h" - -#define MM_SPAD_SRC0_NROWS 16 -#define MM_SPAD_SRC1_NROWS 16 -#define MM_SPAD_DST_NROWS 2 - -struct htp_matmul_context { - const char * type; - struct htp_ops_context * octx; - - void (*vec_dot_1x1)(const int n, float * restrict s0, - const void * restrict vx0, - const void * restrict vy0); - - void (*vec_dot_2x1)(const int n, float * restrict s0, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vy0); - - void (*vec_dot_2x2)(const int n, float * restrict s0, float * restrict s1, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vy0, const void * restrict vy1); - - void (*vec_dot_4x1)(const int n, float * restrict s0, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vx2, const void * restrict vx3, - const void * restrict vy0); - - // Precomputed values - uint32_t src0_nrows_per_thread; - uint32_t src1_nrows_per_thread; - - struct fastdiv_values mm_div_ne12_ne1; - struct fastdiv_values mm_div_ne1; - struct fastdiv_values mm_div_r2; - struct fastdiv_values mm_div_r3; - - // Fields for scattered mapping & HMX support in MUL_MAT_ID - const uint32_t * matrix_row_counts; - const struct mmid_row_mapping * matrix_rows; - bool hmx_eligible; -}; - -// vdelta control to expand first 32 e8m0 values into 32 uint32 elements -static const uint8_t __attribute__((aligned(128))) expand_x32_e8m0[128] = { - 0x00, 0x00, 0x00, 0x00, 0x01, 0x04, 0x00, 0x00, 0x02, 0x00, 0x08, 0x08, 0x01, 0x02, 0x00, 0x04, 0x04, 0x00, 0x00, - 0x00, 0x11, 0x10, 0x10, 0x10, 0x02, 0x00, 0x04, 0x00, 0x01, 0x02, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00, 0x01, 0x04, - 0x00, 0x00, 0x22, 0x20, 0x20, 0x20, 0x21, 0x22, 0x20, 0x24, 0x04, 0x00, 0x00, 0x00, 0x09, 0x08, 0x00, 0x00, 0x02, - 0x00, 0x04, 0x00, 0x11, 0x12, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x01, 0x04, 0x00, 0x00, 0x02, 0x00, 0x08, 0x08, - 0x01, 0x02, 0x00, 0x04, 0x44, 0x40, 0x40, 0x40, 0x41, 0x40, 0x40, 0x40, 0x42, 0x40, 0x44, 0x40, 0x41, 0x42, 0x48, - 0x48, 0x08, 0x08, 0x00, 0x00, 0x01, 0x04, 0x00, 0x00, 0x12, 0x10, 0x10, 0x10, 0x01, 0x02, 0x00, 0x04, 0x04, 0x00, - 0x00, 0x00, 0x09, 0x08, 0x00, 0x00, 0x22, 0x20, 0x24, 0x20, 0x21, 0x22, 0x20, 0x20, -}; - -// IQ4_NL dequantization LUT: maps 4-bit index (0-15) to int8 kvalue -// kvalues: -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113 -static const uint8_t __attribute__((aligned(VLEN))) kvalues_iq4nl_lut[] = { - 0x81, 0, 0x98, 0, 0xAD, 0, 0xBF, 0, 0xCF, 0, 0xDD, 0, 0xEA, 0, 0xF6, 0, 0x01, 0, 0x0D, 0, 0x19, 0, 0x26, 0, - 0x35, 0, 0x45, 0, 0x59, 0, 0x71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -static const uint8_t __attribute__((aligned(VLEN))) kvalues_mxfp4_lut[] = { - 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 6, 0, 8, 0, 12, 0, 0, 0, 0xff, 0, 0xfe, 0, 0xfd, 0, 0xfc, 0, - 0xfa, 0, 0xf8, 0, 0xf4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -static inline HVX_Vector_x8 hvx_vec_load_iq4nlx4x8_full(const uint8_t * restrict ptr) { - const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; - - HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes) - HVX_Vector v2_3 = vptr[1]; // ... - HVX_Vector v4_5 = vptr[2]; // ... - HVX_Vector v6_7 = vptr[3]; // ... - - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - const HVX_Vector lut = *(const HVX_Vector *) kvalues_iq4nl_lut; - - HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F - HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4 - HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F - HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4 - HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F - HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4 - HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4); // & 0x0F - HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4); // >> 4 - - v0 = Q6_Vb_vlut32_VbVbI(v0, lut, 0); - v1 = Q6_Vb_vlut32_VbVbI(v1, lut, 0); - v2 = Q6_Vb_vlut32_VbVbI(v2, lut, 0); - v3 = Q6_Vb_vlut32_VbVbI(v3, lut, 0); - v4 = Q6_Vb_vlut32_VbVbI(v4, lut, 0); - v5 = Q6_Vb_vlut32_VbVbI(v5, lut, 0); - v6 = Q6_Vb_vlut32_VbVbI(v6, lut, 0); - v7 = Q6_Vb_vlut32_VbVbI(v7, lut, 0); - - HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 }; - return r; -} - -static inline HVX_Vector_x8 hvx_vec_load_iq4nlx4x8_partial(const uint8_t * restrict ptr, uint32_t n) { - const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; - - const uint32_t qk = QK_Q4_0x4x2; // 256 - const uint32_t nb = n / qk; - const uint32_t nloe = n % qk; - - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - const HVX_Vector lut = *(const HVX_Vector *) kvalues_iq4nl_lut; - - HVX_Vector_x8 r; - uint32_t i = 0; - - #pragma unroll(2) - for (i = 0; i < nb; i++) { - HVX_Vector v = vptr[i]; // 256 elements (128 bytes) - HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements - HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements - r.v[i * 2 + 0] = Q6_Vb_vlut32_VbVbI(v0, lut, 0); - r.v[i * 2 + 1] = Q6_Vb_vlut32_VbVbI(v1, lut, 0); - } - - if (nloe) { - HVX_Vector v = vptr[i]; // 256 elements (128 bytes) - HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements - HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements - HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:... - r.v[i * 2 + 0] = Q6_Vb_vlut32_VbVbI(Q6_V_lo_W(v0_1_p), lut, 0); - r.v[i * 2 + 1] = Q6_Vb_vlut32_VbVbI(Q6_V_hi_W(v0_1_p), lut, 0); - } - - return r; -} - -// q4x4x2 and q8x4x2 are the flat q4/8_0 formats where all quants are stored first followed by all scales - -static inline size_t q8x4x2_row_size(uint32_t ne) { - // ensures perfect alignment of quants and full row - const uint32_t qk = QK_Q8_0x4x2; - const uint32_t nb = (ne + qk - 1) / qk; - return hex_round_up(ne + nb * 8 * sizeof(__fp16), 128); -} - -static inline size_t q8_1x4x2_row_size(uint32_t ne) { - // ensures perfect alignment of quants and full row - const uint32_t qk = QK_Q8_0x4x2; - const uint32_t nb = (ne + qk - 1) / qk; - return hex_round_up(ne + nb * 8 * 2 * sizeof(__fp16), 128); -} - -static inline HVX_Vector_x8 hvx_vec_load_q4x4x8_full(const uint8_t * restrict ptr) { - const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; - - HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes) - HVX_Vector v2_3 = vptr[1]; // ... - HVX_Vector v4_5 = vptr[2]; // ... - HVX_Vector v6_7 = vptr[3]; // ... - - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - const HVX_Vector i8 = Q6_Vb_vsplat_R(8); - - HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F : first 128 elements - HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4 : second 128 elements - HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F ... - HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4 - HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F - HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4 - HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4); // & 0x0F - HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4); // >> 4 - - // Convert uint4 to int4 (i.e. x - 8) - v0 = Q6_Vb_vsub_VbVb(v0, i8); - v1 = Q6_Vb_vsub_VbVb(v1, i8); - v2 = Q6_Vb_vsub_VbVb(v2, i8); - v3 = Q6_Vb_vsub_VbVb(v3, i8); - v4 = Q6_Vb_vsub_VbVb(v4, i8); - v5 = Q6_Vb_vsub_VbVb(v5, i8); - v6 = Q6_Vb_vsub_VbVb(v6, i8); - v7 = Q6_Vb_vsub_VbVb(v7, i8); - - HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 }; - return r; -} - -static HVX_Vector_x8 hvx_vec_load_q4x4x8_partial(const uint8_t * restrict ptr, uint32_t n) { - const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; - - const uint32_t qk = QK_Q4_0x4x2; // 256 - const uint32_t nb = n / qk; - const uint32_t nloe = n % qk; - - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - const HVX_Vector i8 = Q6_Vb_vsplat_R(8); - - HVX_Vector_x8 r; - uint32_t i = 0; - - #pragma unroll(2) - for (i=0; i < nb; i++) { - HVX_Vector v = vptr[i]; // 256 elements (128 bytes) - HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements - HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements - r.v[i*2+0] = Q6_Vb_vsub_VbVb(v0, i8); - r.v[i*2+1] = Q6_Vb_vsub_VbVb(v1, i8); - } - - if (nloe) { - HVX_Vector v = vptr[i]; // 256 elements (128 bytes) - HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements - HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements - HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:... - r.v[i*2+0] = Q6_Vb_vsub_VbVb(Q6_V_lo_W(v0_1_p), i8); - r.v[i*2+1] = Q6_Vb_vsub_VbVb(Q6_V_hi_W(v0_1_p), i8); - } - - return r; -} - -static inline HVX_Vector_x8 hvx_vec_load_q4_1x4x8_full(const uint8_t * restrict ptr) { - const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; - - HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes) - HVX_Vector v2_3 = vptr[1]; // ... - HVX_Vector v4_5 = vptr[2]; // ... - HVX_Vector v6_7 = vptr[3]; // ... - - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - - HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F : first 128 elements - HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4 : second 128 elements - HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F ... - HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4 - HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F - HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4 - HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4); // & 0x0F - HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4); // >> 4 - - HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 }; - return r; -} - -static HVX_Vector_x8 hvx_vec_load_q4_1x4x8_partial(const uint8_t * restrict ptr, uint32_t n) { - const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; - - const uint32_t qk = QK_Q4_0x4x2; // 256 - const uint32_t nb = n / qk; - const uint32_t nloe = n % qk; - - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - - HVX_Vector_x8 r; - uint32_t i = 0; - - #pragma unroll(2) - for (i=0; i < nb; i++) { - HVX_Vector v = vptr[i]; // 256 elements (128 bytes) - HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements - HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements - r.v[i*2+0] = v0; - r.v[i*2+1] = v1; - } - - if (nloe) { - HVX_Vector v = vptr[i]; // 256 elements (128 bytes) - HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements - HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements - HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:... - r.v[i*2+0] = Q6_V_lo_W(v0_1_p); - r.v[i*2+1] = Q6_V_hi_W(v0_1_p); - } - - return r; -} - -static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8_full(const uint8_t * restrict ptr) { - const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; - - HVX_Vector v0_1 = vptr[0]; // first 256 elements (128 bytes) - HVX_Vector v2_3 = vptr[1]; // ... - HVX_Vector v4_5 = vptr[2]; // ... - HVX_Vector v6_7 = vptr[3]; // ... - - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - const HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut; - - HVX_Vector v0 = Q6_V_vand_VV(v0_1, mask_h4); // & 0x0F - HVX_Vector v1 = Q6_Vub_vlsr_VubR(v0_1, 4); // >> 4 - HVX_Vector v2 = Q6_V_vand_VV(v2_3, mask_h4); // & 0x0F - HVX_Vector v3 = Q6_Vub_vlsr_VubR(v2_3, 4); // >> 4 - HVX_Vector v4 = Q6_V_vand_VV(v4_5, mask_h4); // & 0x0F - HVX_Vector v5 = Q6_Vub_vlsr_VubR(v4_5, 4); // >> 4 - HVX_Vector v6 = Q6_V_vand_VV(v6_7, mask_h4); // & 0x0F - HVX_Vector v7 = Q6_Vub_vlsr_VubR(v6_7, 4); // >> 4 - - v0 = Q6_Vb_vlut32_VbVbI(v0, lut, 0); - v1 = Q6_Vb_vlut32_VbVbI(v1, lut, 0); - v2 = Q6_Vb_vlut32_VbVbI(v2, lut, 0); - v3 = Q6_Vb_vlut32_VbVbI(v3, lut, 0); - v4 = Q6_Vb_vlut32_VbVbI(v4, lut, 0); - v5 = Q6_Vb_vlut32_VbVbI(v5, lut, 0); - v6 = Q6_Vb_vlut32_VbVbI(v6, lut, 0); - v7 = Q6_Vb_vlut32_VbVbI(v7, lut, 0); - - HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 }; - return r; -} - -static inline HVX_Vector_x8 hvx_vec_load_mxfp4x4x8_partial(const uint8_t * restrict ptr, uint32_t n) { - const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; - - const uint32_t qk = QK_Q4_0x4x2; // 256 - const uint32_t nb = n / qk; - const uint32_t nloe = n % qk; - - const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - const HVX_Vector lut = *(const HVX_Vector *) kvalues_mxfp4_lut; - - HVX_Vector_x8 r; - uint32_t i = 0; - - #pragma unroll(2) - for (i=0; i < nb; i++) { - HVX_Vector v = vptr[i]; // 256 elements (128 bytes) - HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : first 128 elements - HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : second 128 elements - r.v[i*2+0] = Q6_Vb_vlut32_VbVbI(v0, lut, 0); - r.v[i*2+1] = Q6_Vb_vlut32_VbVbI(v1, lut, 0); - } - - if (nloe) { - HVX_Vector v = vptr[i]; // 256 elements (128 bytes) - HVX_Vector v0 = Q6_V_vand_VV(v, mask_h4); // & 0x0F : even 128 elements - HVX_Vector v1 = Q6_Vub_vlsr_VubR(v, 4); // >> 4 : odd 128 elements - HVX_VectorPair v0_1_p = Q6_W_vshuff_VVR(v1, v0, -1); // zip even:odd:... - r.v[i*2+0] = Q6_Vb_vlut32_VbVbI(Q6_V_lo_W(v0_1_p), lut, 0); - r.v[i*2+1] = Q6_Vb_vlut32_VbVbI(Q6_V_hi_W(v0_1_p), lut, 0); - } - - return r; -} - -static inline HVX_Vector_x8 hvx_vec_load_q8x4x8_full(const uint8_t * restrict ptr) { - const HVX_Vector * restrict vptr = (const HVX_Vector *) ptr; - - HVX_Vector v0 = vptr[0]; // first 128 vals - HVX_Vector v1 = vptr[1]; // ... - HVX_Vector v2 = vptr[2]; // ... - HVX_Vector v3 = vptr[3]; // ... - HVX_Vector v4 = vptr[4]; // ... - HVX_Vector v5 = vptr[5]; // ... - HVX_Vector v6 = vptr[6]; // ... - HVX_Vector v7 = vptr[7]; // ... - - HVX_Vector_x8 r = { v0, v1, v2, v3, v4, v5, v6, v7 }; - return r; -} - -static inline HVX_Vector_x8 hvx_vec_load_q8x4x8_partial(const uint8_t * restrict ptr, uint32_t nloe) { - return hvx_vec_load_q8x4x8_full(ptr); -} - -// Reduce multiply 1024 x 1024 int8 elements (32x q4/8 blocks in 8x HVX vectors). -// Accumulate each block into a single int32 value. -// Return a single HVX vector with 32x int32 accumulators. -// This version is parameterized to support less than 1024 elements. -// if() checks are optimized out at compile time -- make sure to pass N as a constexpr. - -static inline HVX_Vector hvx_vec_rmpy_x8_n(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) { - HVX_Vector r0 = Q6_V_vzero(); - HVX_Vector r1 = Q6_V_vzero(); - HVX_Vector r2 = Q6_V_vzero(); - HVX_Vector r3 = Q6_V_vzero(); - HVX_Vector r4 = Q6_V_vzero(); - HVX_Vector r5 = Q6_V_vzero(); - HVX_Vector r6 = Q6_V_vzero(); - HVX_Vector r7 = Q6_V_vzero(); - - HVX_VectorPair p3; - HVX_VectorPair p2; - HVX_VectorPair p1; - HVX_VectorPair p0; - - if (n >= 128) { r0 = Q6_Vw_vrmpy_VbVb(x.v[0], y.v[0]); } - if (n >= 256) { r1 = Q6_Vw_vrmpy_VbVb(x.v[1], y.v[1]); } - if (n >= 384) { r2 = Q6_Vw_vrmpy_VbVb(x.v[2], y.v[2]); } - if (n >= 512) { r3 = Q6_Vw_vrmpy_VbVb(x.v[3], y.v[3]); } - if (n >= 640) { r4 = Q6_Vw_vrmpy_VbVb(x.v[4], y.v[4]); } - if (n >= 768) { r5 = Q6_Vw_vrmpy_VbVb(x.v[5], y.v[5]); } - if (n >= 896) { r6 = Q6_Vw_vrmpy_VbVb(x.v[6], y.v[6]); } - if (n >= 1024) { r7 = Q6_Vw_vrmpy_VbVb(x.v[7], y.v[7]); } - - if (n >= 128) { p0 = Q6_W_vdeal_VVR(r1, r0, -4); } - if (n >= 384) { p1 = Q6_W_vdeal_VVR(r3, r2, -4); } - if (n >= 640) { p2 = Q6_W_vdeal_VVR(r5, r4, -4); } - if (n >= 896) { p3 = Q6_W_vdeal_VVR(r7, r6, -4); } - - if (n >= 128) { r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); } - if (n >= 384) { r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); } - if (n >= 640) { r2 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p2), Q6_V_hi_W(p2)); } - if (n >= 896) { r3 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p3), Q6_V_hi_W(p3)); } - - if (n >= 128) { p0 = Q6_W_vdeal_VVR(r1, r0, -4); } - if (n >= 640) { p1 = Q6_W_vdeal_VVR(r3, r2, -4); } - - if (n >= 128) { r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); } - if (n >= 640) { r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); } - - if (n >= 128) { p0 = Q6_W_vdeal_VVR(r1, r0, -4); } - if (n >= 128) { r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); } - - return r0; -} - -static inline HVX_Vector hvx_vec_rmpy_x8_full(HVX_Vector_x8 x, HVX_Vector_x8 y) { - HVX_Vector r0 = Q6_Vw_vrmpy_VbVb(x.v[0], y.v[0]); - HVX_Vector r1 = Q6_Vw_vrmpy_VbVb(x.v[1], y.v[1]); - HVX_Vector r2 = Q6_Vw_vrmpy_VbVb(x.v[2], y.v[2]); - HVX_Vector r3 = Q6_Vw_vrmpy_VbVb(x.v[3], y.v[3]); - HVX_Vector r4 = Q6_Vw_vrmpy_VbVb(x.v[4], y.v[4]); - HVX_Vector r5 = Q6_Vw_vrmpy_VbVb(x.v[5], y.v[5]); - HVX_Vector r6 = Q6_Vw_vrmpy_VbVb(x.v[6], y.v[6]); - HVX_Vector r7 = Q6_Vw_vrmpy_VbVb(x.v[7], y.v[7]); - - HVX_VectorPair p0 = Q6_W_vdeal_VVR(r1, r0, -4); - HVX_VectorPair p1 = Q6_W_vdeal_VVR(r3, r2, -4); - HVX_VectorPair p2 = Q6_W_vdeal_VVR(r5, r4, -4); - HVX_VectorPair p3 = Q6_W_vdeal_VVR(r7, r6, -4); - - r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); - r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); - r2 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p2), Q6_V_hi_W(p2)); - r3 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p3), Q6_V_hi_W(p3)); - - p0 = Q6_W_vdeal_VVR(r1, r0, -4); - p1 = Q6_W_vdeal_VVR(r3, r2, -4); - - r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); - r1 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p1), Q6_V_hi_W(p1)); - - p0 = Q6_W_vdeal_VVR(r1, r0, -4); - r0 = Q6_Vw_vadd_VwVw(Q6_V_lo_W(p0), Q6_V_hi_W(p0)); - - return r0; -} - -static inline HVX_Vector hvx_vec_rmpy_x8_partial(HVX_Vector_x8 x, HVX_Vector_x8 y, unsigned int n) { - if (n >= 512) - return hvx_vec_rmpy_x8_full(x, y); - - return hvx_vec_rmpy_x8_partial(x, y, 512); -} - -static void vec_dot_q4_1x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) { - assert(n % 32 == 0); // min sub-block size - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2 * 2; // 32x (d, m) __fp16 = 128 bytes - const uint32_t x_qblk_size = qk / 2; // int4 - const uint32_t x_qrow_size = n / 2; // int4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 4; // 32x (d, s) __fp16 = 128 bytes - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales/offsets - - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales/sums - - // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vzero(); - - const uint32_t nb = n / qk; // num full blocks - const uint32_t nloe = n % qk; // num leftover elemements - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_full(r0_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - - HVX_Vector ds = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal)); - HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal)); - - HVX_Vector dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_VectorPair dm_deal = Q6_W_vdeal_VVR(dm, dm, -2); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(dm_deal)); - HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(dm_deal)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s))); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum)); - } - - // Process leftovers - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - - HVX_Vector ds = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal)); - HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal)); - - HVX_Vector dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_VectorPair dm_deal = Q6_W_vdeal_VVR(dm, dm, -2); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(dm_deal)); - HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(dm_deal)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s))); - - // Zero out unused elements - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r0_ms = Q6_V_vand_QV(bmask, r0_ms); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum)); - } - - r0_sum = hvx_vec_reduce_sum_f32(r0_sum); - hvx_vec_store_u(s0, 4, r0_sum); -} - -static void vec_dot_q4_1x4x2_q8x4x2_2x1(const int n, float * restrict s0, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vy0) { - assert(n % 32 == 0); // min sub-block size - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2 * 2; // 32x (d, m) __fp16 = 128 bytes - const uint32_t x_qblk_size = qk / 2; // int4 - const uint32_t x_qrow_size = n / 2; // int4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 4; // 32x (d, s) __fp16 = 128 bytes - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales - - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales/sums - - // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vzero(); - HVX_Vector r1_sum = Q6_V_vzero(); - - const uint32_t nb = n / qk; // num full blocks - const uint32_t nloe = n % qk; // num leftover elemements - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_full(r1_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - - HVX_Vector ds = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal)); - HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal)); - - HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal)); - HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal)); - - HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal)); - HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s))); - - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - HVX_Vector r1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy_s))); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms); - - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - HVX_Vector r1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_ms); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa_total, r1_sum)); - } - - // Process leftovers - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); - - HVX_Vector ds = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal)); - HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal)); - - HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal)); - HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal)); - - HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal)); - HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s))); - - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - HVX_Vector r1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy_s))); - - // Zero out unused elements - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r0_ms = Q6_V_vand_QV(bmask, r0_ms); - r1_dd = Q6_V_vand_QV(bmask, r1_dd); - r1_ms = Q6_V_vand_QV(bmask, r1_ms); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - r1_ia = Q6_V_vand_QV(bmask, r1_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms); - - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - HVX_Vector r1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_ms); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa_total, r1_sum)); - } - - HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum); - hvx_vec_store_u(s0, 8, rsum); -} - -static void vec_dot_q4_1x4x2_q8x4x2_4x1(const int n, float * restrict s0, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vx2, const void * restrict vx3, - const void * restrict vy0) { - assert(n % 32 == 0); // min sub-block size - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vx2 % 128 == 0); - assert((unsigned long) vx3 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2 * 2; // 32x (d, m) __fp16 = 128 bytes - const uint32_t x_qblk_size = qk / 2; // int4 - const uint32_t x_qrow_size = n / 2; // int4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 4; // 32x (d, s) __fp16 = 128 bytes - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales - const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0; // quants first - const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size; // then scales - const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0; // quants first - const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size; // then scales - - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales/sums - - // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vzero(); - HVX_Vector r1_sum = Q6_V_vzero(); - HVX_Vector r2_sum = Q6_V_vzero(); - HVX_Vector r3_sum = Q6_V_vzero(); - - const uint32_t nb = n / qk; // num full blocks - const uint32_t nloe = n % qk; // num leftover elements - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_full(r1_x_q + i * x_qblk_size); - HVX_Vector_x8 r2_q = hvx_vec_load_q4_1x4x8_full(r2_x_q + i * x_qblk_size); - HVX_Vector_x8 r3_q = hvx_vec_load_q4_1x4x8_full(r3_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q)); - HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q)); - - HVX_Vector ds = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal)); - HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal)); - - HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal)); - HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal)); - - HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal)); - HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal)); - - HVX_Vector r2_dm = *(const HVX_UVector *) (r2_x_d + i * x_dblk_size); - HVX_VectorPair r2_dm_deal = Q6_W_vdeal_VVR(r2_dm, r2_dm, -2); - HVX_Vector r2_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r2_dm_deal)); - HVX_Vector r2_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r2_dm_deal)); - - HVX_Vector r3_dm = *(const HVX_UVector *) (r3_x_d + i * x_dblk_size); - HVX_VectorPair r3_dm_deal = Q6_W_vdeal_VVR(r3_dm, r3_dm, -2); - HVX_Vector r3_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r3_dm_deal)); - HVX_Vector r3_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r3_dm_deal)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s))); - - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - HVX_Vector r1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy_s))); - - HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); - HVX_Vector r2_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_m, vy_s))); - - HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); - HVX_Vector r3_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_m, vy_s))); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms); - - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - HVX_Vector r1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_ms); - - HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); - HVX_Vector r2_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_ms); - - HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); - HVX_Vector r3_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_ms); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa_total, r1_sum)); - r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa_total, r2_sum)); - r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa_total, r3_sum)); - } - - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r2_q = hvx_vec_load_q4_1x4x8_partial(r2_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r3_q = hvx_vec_load_q4_1x4x8_partial(r3_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); - HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r2_q, vy_q, nloe)); - HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r3_q, vy_q, nloe)); - - HVX_Vector ds = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_VectorPair ds_deal = Q6_W_vdeal_VVR(ds, ds, -2); - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds_deal)); - HVX_Vector vy_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds_deal)); - - HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal)); - HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal)); - - HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal)); - HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal)); - - HVX_Vector r2_dm = *(const HVX_UVector *) (r2_x_d + i * x_dblk_size); - HVX_VectorPair r2_dm_deal = Q6_W_vdeal_VVR(r2_dm, r2_dm, -2); - HVX_Vector r2_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r2_dm_deal)); - HVX_Vector r2_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r2_dm_deal)); - - HVX_Vector r3_dm = *(const HVX_UVector *) (r3_x_d + i * x_dblk_size); - HVX_VectorPair r3_dm_deal = Q6_W_vdeal_VVR(r3_dm, r3_dm, -2); - HVX_Vector r3_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r3_dm_deal)); - HVX_Vector r3_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r3_dm_deal)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy_s))); - - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - HVX_Vector r1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy_s))); - - HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); - HVX_Vector r2_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_m, vy_s))); - - HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); - HVX_Vector r3_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_m, vy_s))); - - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r0_ms = Q6_V_vand_QV(bmask, r0_ms); - r1_dd = Q6_V_vand_QV(bmask, r1_dd); - r1_ms = Q6_V_vand_QV(bmask, r1_ms); - r2_dd = Q6_V_vand_QV(bmask, r2_dd); - r2_ms = Q6_V_vand_QV(bmask, r2_ms); - r3_dd = Q6_V_vand_QV(bmask, r3_dd); - r3_ms = Q6_V_vand_QV(bmask, r3_ms); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - r1_ia = Q6_V_vand_QV(bmask, r1_ia); - r2_ia = Q6_V_vand_QV(bmask, r2_ia); - r3_ia = Q6_V_vand_QV(bmask, r3_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_ms); - - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - HVX_Vector r1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_ms); - - HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); - HVX_Vector r2_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_ms); - - HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); - HVX_Vector r3_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_ms); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa_total, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa_total, r1_sum)); - r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa_total, r2_sum)); - r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa_total, r3_sum)); - } - - HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } }; - HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in); - hvx_vec_store_u(s0, 16, rsum); -} - - -static void vec_dot_q4_1x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vy0, const void * restrict vy1) { - assert(n % 32 == 0); - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - assert((unsigned long) vy1 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2 * 2; // 32x (d, m) __fp16 = 128 bytes - const uint32_t x_qblk_size = qk / 2; // int4 - const uint32_t x_qrow_size = n / 2; // int4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 4; // 32x (d, s) __fp16 = 128 bytes - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales - - const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0; // quants first - const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales/sums - const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0; // quants first - const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales/sums - - // Row sums (sf) - 4 accumulators for 2×2 tile - HVX_Vector r0_c0_sum = Q6_V_vzero(); - HVX_Vector r0_c1_sum = Q6_V_vzero(); - HVX_Vector r1_c0_sum = Q6_V_vzero(); - HVX_Vector r1_c1_sum = Q6_V_vzero(); - - const uint32_t nb = n / qk; // num full blocks - const uint32_t nloe = n % qk; // num leftover elements - - uint32_t i = 0; - for (; i < nb; i++) { - // Load src1 columns - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size); - - // Load src0 rows - HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_full(r1_x_q + i * x_qblk_size); - - // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1 - HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q)); - HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q)); - HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q)); - HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q)); - - // Load scales - HVX_Vector ds0 = *(const HVX_UVector *) (y0_d + i * y_dblk_size); - HVX_VectorPair ds0_deal = Q6_W_vdeal_VVR(ds0, ds0, -2); - HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds0_deal)); - HVX_Vector vy0_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds0_deal)); - - HVX_Vector ds1 = *(const HVX_UVector *) (y1_d + i * y_dblk_size); - HVX_VectorPair ds1_deal = Q6_W_vdeal_VVR(ds1, ds1, -2); - HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds1_deal)); - HVX_Vector vy1_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds1_deal)); - - HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal)); - HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal)); - - HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal)); - HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal)); - - // Compute combined scales - HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d))); - HVX_Vector r0_c0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy0_s))); - - HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d))); - HVX_Vector r0_c1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy1_s))); - - HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d))); - HVX_Vector r1_c0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy0_s))); - - HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d))); - HVX_Vector r1_c1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy1_s))); - - // Apply scales and accumulate - HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd); - HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd); - HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd); - HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd); - - HVX_Vector r0_c0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_ms); - HVX_Vector r0_c1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_ms); - HVX_Vector r1_c0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_ms); - HVX_Vector r1_c1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_ms); - - r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa_total, r0_c0_sum)); - r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa_total, r0_c1_sum)); - r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa_total, r1_c0_sum)); - r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa_total, r1_c1_sum)); - } - - // Process leftovers - if (nloe) { - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q4_1x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_q4_1x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe)); - HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe)); - HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe)); - HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe)); - - HVX_Vector ds0 = *(const HVX_UVector *) (y0_d + i * y_dblk_size); - HVX_VectorPair ds0_deal = Q6_W_vdeal_VVR(ds0, ds0, -2); - HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds0_deal)); - HVX_Vector vy0_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds0_deal)); - - HVX_Vector ds1 = *(const HVX_UVector *) (y1_d + i * y_dblk_size); - HVX_VectorPair ds1_deal = Q6_W_vdeal_VVR(ds1, ds1, -2); - HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(ds1_deal)); - HVX_Vector vy1_s = Q6_Vh_vshuff_Vh(Q6_V_hi_W(ds1_deal)); - - HVX_Vector r0_dm = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_VectorPair r0_dm_deal = Q6_W_vdeal_VVR(r0_dm, r0_dm, -2); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r0_dm_deal)); - HVX_Vector r0_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r0_dm_deal)); - - HVX_Vector r1_dm = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - HVX_VectorPair r1_dm_deal = Q6_W_vdeal_VVR(r1_dm, r1_dm, -2); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(Q6_V_lo_W(r1_dm_deal)); - HVX_Vector r1_m = Q6_Vh_vshuff_Vh(Q6_V_hi_W(r1_dm_deal)); - - HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d))); - HVX_Vector r0_c0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy0_s))); - - HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d))); - HVX_Vector r0_c1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_m, vy1_s))); - - HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d))); - HVX_Vector r1_c0_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy0_s))); - - HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d))); - HVX_Vector r1_c1_ms = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_m, vy1_s))); - - // Zero out unused elements - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd); - r0_c0_ms = Q6_V_vand_QV(bmask, r0_c0_ms); - r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd); - r0_c1_ms = Q6_V_vand_QV(bmask, r0_c1_ms); - r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd); - r1_c0_ms = Q6_V_vand_QV(bmask, r1_c0_ms); - r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd); - r1_c1_ms = Q6_V_vand_QV(bmask, r1_c1_ms); - - r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia); - r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia); - r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia); - r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia); - - HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd); - HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd); - HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd); - HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd); - - HVX_Vector r0_c0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_ms); - HVX_Vector r0_c1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_ms); - HVX_Vector r1_c0_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_ms); - HVX_Vector r1_c1_fa_total = Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_ms); - - r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa_total, r0_c0_sum)); - r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa_total, r0_c1_sum)); - r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa_total, r1_c0_sum)); - r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa_total, r1_c1_sum)); - } - - // Reduce and store results - HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum); - HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum); - - hvx_vec_store_u(s0, 8, r0_r1_c0_sum); // row0,col0 row1,col0 - hvx_vec_store_u(s1, 8, r0_r1_c1_sum); // row0,col1 row1,col1 -} - -static void vec_dot_q4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) { - assert(n % 32 == 0); // min sub-block size - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t x_qblk_size = qk / 2; // int4 - const uint32_t x_qrow_size = n / 2; // int4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales - - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales - - // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vzero(); - - // Multiply and accumulate into int32. - // Compute combined scale (fp32). - // Apply scale to acc and accumulate into the row sum (qf32). - - const uint32_t nb = n / qk; // num full blocks - const uint32_t nloe = n % qk; // num leftover elemements - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - } - - // Process leftovers - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - - // Zero out unused elements - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - } - - r0_sum = hvx_vec_reduce_sum_f32(r0_sum); - - hvx_vec_store_u(s0, 4, r0_sum); -} - -static void vec_dot_q4x4x2_q8x4x2_2x1(const int n, float * restrict s0, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vy0) { - assert(n % 32 == 0); // min sub-block size - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t x_qblk_size = qk / 2; // int4 - const uint32_t x_qrow_size = n / 2; // int4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales - - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales - - // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vzero(); - HVX_Vector r1_sum = Q6_V_vzero(); - - // Multiply and accumulate into int32. - // Compute combined scale (fp32). - // Apply scale to acc and accumulate into the row sum (qf32). - - const uint32_t nb = n / qk; // num full blocks - const uint32_t nloe = n % qk; // num leftover elemements - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - } - - // Process leftovers - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - - // Zero out unused elements - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r1_dd = Q6_V_vand_QV(bmask, r1_dd); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - r1_ia = Q6_V_vand_QV(bmask, r1_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - } - - HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum); - hvx_vec_store_u(s0, 8, rsum); -} - -static void vec_dot_q4x4x2_q8x4x2_4x1(const int n, float * restrict s0, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vx2, const void * restrict vx3, - const void * restrict vy0) { - assert(n % 32 == 0); // min sub-block size - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vx2 % 128 == 0); - assert((unsigned long) vx3 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t x_qblk_size = qk / 2; // int4 - const uint32_t x_qrow_size = n / 2; // int4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; - const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0; - const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size; - const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0; - const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size; - - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); - - // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vzero(); - HVX_Vector r1_sum = Q6_V_vzero(); - HVX_Vector r2_sum = Q6_V_vzero(); - HVX_Vector r3_sum = Q6_V_vzero(); - - const uint32_t nb = n / qk; // num full blocks - const uint32_t nloe = n % qk; // num leftover elements - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size); - HVX_Vector_x8 r2_q = hvx_vec_load_q4x4x8_full(r2_x_q + i * x_qblk_size); - HVX_Vector_x8 r3_q = hvx_vec_load_q4x4x8_full(r3_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q)); - HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size)); - HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); - HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); - HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); - r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); - } - - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r2_q = hvx_vec_load_q4x4x8_partial(r2_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r3_q = hvx_vec_load_q4x4x8_partial(r3_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); - HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r2_q, vy_q, nloe)); - HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r3_q, vy_q, nloe)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size)); - HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); - HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); - - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r1_dd = Q6_V_vand_QV(bmask, r1_dd); - r2_dd = Q6_V_vand_QV(bmask, r2_dd); - r3_dd = Q6_V_vand_QV(bmask, r3_dd); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - r1_ia = Q6_V_vand_QV(bmask, r1_ia); - r2_ia = Q6_V_vand_QV(bmask, r2_ia); - r3_ia = Q6_V_vand_QV(bmask, r3_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); - HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); - r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); - } - - HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } }; - HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in); - hvx_vec_store_u(s0, 16, rsum); -} - - -static void vec_dot_q4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vy0, const void * restrict vy1) { - assert(n % 32 == 0); - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - assert((unsigned long) vy1 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t x_qblk_size = qk / 2; // int4 - const uint32_t x_qrow_size = n / 2; // int4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales - - const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0; // quants first - const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales - const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0; // quants first - const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales - - // Row sums (sf) - 4 accumulators for 2×2 tile - HVX_Vector r0_c0_sum = Q6_V_vzero(); - HVX_Vector r0_c1_sum = Q6_V_vzero(); - HVX_Vector r1_c0_sum = Q6_V_vzero(); - HVX_Vector r1_c1_sum = Q6_V_vzero(); - - const uint32_t nb = n / qk; // num full blocks - const uint32_t nloe = n % qk; // num leftover elements - - uint32_t i = 0; - for (; i < nb; i++) { - // Load src1 columns (reused across both src0 rows) - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size); - - // Load src0 rows (reused across both src1 columns) - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_full(r1_x_q + i * x_qblk_size); - - // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1 - HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q)); - HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q)); - HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q)); - HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q)); - - // Load scales - HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); - HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - // Compute combined scales - HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d))); - HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d))); - HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d))); - HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d))); - - // Apply scales and accumulate - HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd); - HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd); - HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd); - HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd); - - r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum)); - r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum)); - r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum)); - r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum)); - } - - // Process leftovers - if (nloe) { - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_q4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe)); - HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe)); - HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe)); - HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe)); - - HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); - HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d))); - HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d))); - HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d))); - HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d))); - - // Zero out unused scales - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd); - r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd); - r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd); - r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd); - r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia); - r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia); - r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia); - r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia); - - HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd); - HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd); - HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd); - HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd); - - r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum)); - r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum)); - r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum)); - r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum)); - } - - // Reduce and store results - HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum); - HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum); - - hvx_vec_store_u(s0, 8, r0_r1_c0_sum); // row0,col0 row1,col0 - hvx_vec_store_u(s1, 8, r0_r1_c1_sum); // row0,col1 row1,col1 -} - -static void vec_dot_q8x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) { - assert(n % 32 == 0); // min sub-block size - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t x_qblk_size = qk; // int8 - const uint32_t x_qrow_size = n; // int8 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales - - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales - - // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vzero(); - - // Multiply and accumulate into int32. - // Compute combined scale (fp32). - // Apply scale to acc and accumulate into the row sum (qf32). - - const uint32_t nb = n / qk; // num full blocks - int32_t nloe = n % qk; // num leftover elemements (must be signed) - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - } - - // Process leftovers - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - - // Zero out unused elements - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - } - - r0_sum = hvx_vec_reduce_sum_f32(r0_sum); - - hvx_vec_store_u(s0, 4, r0_sum); -} - -static void vec_dot_q8x4x2_q8x4x2_2x1(const int n, float * restrict s0, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vy0) { - assert(n % 32 == 0); // min sub-block size - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t x_qblk_size = qk; // int8 - const uint32_t x_qrow_size = n; // int8 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales - - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales - - // Row sum (qf32) - HVX_Vector r0_sum = Q6_V_vzero(); - HVX_Vector r1_sum = Q6_V_vzero(); - - // Multiply and accumulate into int32. - // Compute combined scale (fp32). - // Apply scale to acc and accumulate into the row sum (qf32). - - const uint32_t nb = n / qk; // num full blocks - int32_t nloe = n % qk; // num leftover elemements (must be signed) - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - } - - // Process leftovers - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - - // Zero out unused elements - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r1_dd = Q6_V_vand_QV(bmask, r1_dd); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - r1_ia = Q6_V_vand_QV(bmask, r1_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - } - - HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum); - hvx_vec_store_u(s0, 8, rsum); -} - -static void vec_dot_q8x4x2_q8x4x2_4x1(const int n, float * restrict s0, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vx2, const void * restrict vx3, - const void * restrict vy0) { - assert(n % 32 == 0); // min sub-block size - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vx2 % 128 == 0); - assert((unsigned long) vx3 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t x_qblk_size = qk; // int8 - const uint32_t x_qrow_size = n; // int8 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales - const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0; // quants first - const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size; // then scales - const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0; // quants first - const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size; // then scales - - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales - - // Row sum (qf32) - HVX_Vector r0_sum = Q6_V_vzero(); - HVX_Vector r1_sum = Q6_V_vzero(); - HVX_Vector r2_sum = Q6_V_vzero(); - HVX_Vector r3_sum = Q6_V_vzero(); - - const uint32_t nb = n / qk; // num full blocks - int32_t nloe = n % qk; // num leftover elemements (must be signed) - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size); - HVX_Vector_x8 r2_q = hvx_vec_load_q8x4x8_full(r2_x_q + i * x_qblk_size); - HVX_Vector_x8 r3_q = hvx_vec_load_q8x4x8_full(r3_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q)); - HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size)); - HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); - HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); - HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); - r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); - } - - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r2_q = hvx_vec_load_q8x4x8_partial(r2_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r3_q = hvx_vec_load_q8x4x8_partial(r3_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); - HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r2_q, vy_q, nloe)); - HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r3_q, vy_q, nloe)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size)); - HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); - HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); - - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r1_dd = Q6_V_vand_QV(bmask, r1_dd); - r2_dd = Q6_V_vand_QV(bmask, r2_dd); - r3_dd = Q6_V_vand_QV(bmask, r3_dd); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - r1_ia = Q6_V_vand_QV(bmask, r1_ia); - r2_ia = Q6_V_vand_QV(bmask, r2_ia); - r3_ia = Q6_V_vand_QV(bmask, r3_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); - HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); - r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); - } - - HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } }; - HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in); - hvx_vec_store_u(s0, 16, rsum); -} - - -static void vec_dot_q8x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vy0, const void * restrict vy1) { - assert(n % 32 == 0); - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - assert((unsigned long) vy1 % 128 == 0); - - const uint32_t qk = QK_Q8_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t x_qblk_size = qk; // int8 - const uint32_t x_qrow_size = n; // int8 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales - - const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0; // quants first - const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales - const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0; // quants first - const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales - - // Row sums (sf) - 4 accumulators for 2×2 tile - HVX_Vector r0_c0_sum = Q6_V_vzero(); - HVX_Vector r0_c1_sum = Q6_V_vzero(); - HVX_Vector r1_c0_sum = Q6_V_vzero(); - HVX_Vector r1_c1_sum = Q6_V_vzero(); - - const uint32_t nb = n / qk; // num full blocks - const uint32_t nloe = n % qk; // num leftover elements - - uint32_t i = 0; - for (; i < nb; i++) { - // Load src1 columns (reused across both src0 rows) - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size); - - // Load src0 rows (reused across both src1 columns) - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_full(r1_x_q + i * x_qblk_size); - - // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1 - HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q)); - HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q)); - HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q)); - HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q)); - - // Load scales - HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); - HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - // Compute combined scales - HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d))); - HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d))); - HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d))); - HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d))); - - // Apply scales and accumulate - HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd); - HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd); - HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd); - HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd); - - r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum)); - r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum)); - r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum)); - r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum)); - } - - // Process leftovers - if (nloe) { - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_q8x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_q8x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe)); - HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe)); - HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe)); - HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe)); - - HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); - HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d))); - HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d))); - HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d))); - HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d))); - - // Zero out unused elements - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd); - r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd); - r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd); - r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd); - r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia); - r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia); - r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia); - r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia); - - HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd); - HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd); - HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd); - HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd); - - r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum)); - r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum)); - r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum)); - r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum)); - } - - // Reduce and store results - HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum); - HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum); - - hvx_vec_store_u(&s0[0], 8, r0_r1_c0_sum); // row0,col0 row1,col0 - hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum); // row0,col1 row1,col1 -} - -// ======== IQ4_NL x Q8_0 vec_dot kernels ======== -// Same structure as Q4_0 vec_dot but uses IQ4_NL LUT-based load (4-bit index -> int8 kvalue). -// Scale format is identical to Q4_0 (fp16 scales). - -static void vec_dot_iq4nlx4x2_q8x4x2_1x1(const int n, - float * restrict s0, - const void * restrict vx0, - const void * restrict vy0) { - assert(n % 32 == 0); - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t x_qblk_size = qk / 2; // int4 - const uint32_t x_qrow_size = n / 2; // int4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales - - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales - - HVX_Vector r0_sum = Q6_V_vzero(); - - const uint32_t nb = n / qk; - const uint32_t nloe = n % qk; - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - } - - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - } - - r0_sum = hvx_vec_reduce_sum_f32(r0_sum); - - hvx_vec_store_u(s0, 4, r0_sum); -} - -static void vec_dot_iq4nlx4x2_q8x4x2_2x1(const int n, - float * restrict s0, - const void * restrict vx0, - const void * restrict vx1, - const void * restrict vy0) { - assert(n % 32 == 0); - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t x_qblk_size = qk / 2; // int4 - const uint32_t x_qrow_size = n / 2; // int4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales - - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales - - HVX_Vector r0_sum = Q6_V_vzero(); - HVX_Vector r1_sum = Q6_V_vzero(); - - const uint32_t nb = n / qk; - const uint32_t nloe = n % qk; - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_full(r1_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - } - - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_partial(r1_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r1_dd = Q6_V_vand_QV(bmask, r1_dd); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - r1_ia = Q6_V_vand_QV(bmask, r1_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - } - - HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum); - hvx_vec_store_u(s0, 8, rsum); -} - -static void vec_dot_iq4nlx4x2_q8x4x2_4x1(const int n, - float * restrict s0, - const void * restrict vx0, - const void * restrict vx1, - const void * restrict vx2, - const void * restrict vx3, - const void * restrict vy0) { - assert(n % 32 == 0); - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vx2 % 128 == 0); - assert((unsigned long) vx3 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t x_qblk_size = qk / 2; // int4 - const uint32_t x_qrow_size = n / 2; // int4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales - const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0; // quants first - const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size; // then scales - const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0; // quants first - const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size; // then scales - - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales - - HVX_Vector r0_sum = Q6_V_vzero(); - HVX_Vector r1_sum = Q6_V_vzero(); - HVX_Vector r2_sum = Q6_V_vzero(); - HVX_Vector r3_sum = Q6_V_vzero(); - - const uint32_t nb = n / qk; - const uint32_t nloe = n % qk; - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full(y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_full(r1_x_q + i * x_qblk_size); - HVX_Vector_x8 r2_q = hvx_vec_load_iq4nlx4x8_full(r2_x_q + i * x_qblk_size); - HVX_Vector_x8 r3_q = hvx_vec_load_iq4nlx4x8_full(r3_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q)); - HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size)); - HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); - HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); - HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); - r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); - } - - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial(y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_partial(r1_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r2_q = hvx_vec_load_iq4nlx4x8_partial(r2_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r3_q = hvx_vec_load_iq4nlx4x8_partial(r3_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy_q, nloe)); - HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r2_q, vy_q, nloe)); - HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r3_q, vy_q, nloe)); - - HVX_Vector vy_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - HVX_Vector r2_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r2_x_d + i * x_dblk_size)); - HVX_Vector r3_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r3_x_d + i * x_dblk_size)); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy_d))); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy_d))); - HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r2_d, vy_d))); - HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r3_d, vy_d))); - - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r1_dd = Q6_V_vand_QV(bmask, r1_dd); - r2_dd = Q6_V_vand_QV(bmask, r2_dd); - r3_dd = Q6_V_vand_QV(bmask, r3_dd); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - r1_ia = Q6_V_vand_QV(bmask, r1_ia); - r2_ia = Q6_V_vand_QV(bmask, r2_ia); - r3_ia = Q6_V_vand_QV(bmask, r3_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); - HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); - r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); - } - - HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } }; - HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in); - hvx_vec_store_u(s0, 16, rsum); -} - - -static void vec_dot_iq4nlx4x2_q8x4x2_2x2(const int n, - float * restrict s0, - float * restrict s1, - const void * restrict vx0, - const void * restrict vx1, - const void * restrict vy0, - const void * restrict vy1) { - assert(n % 32 == 0); - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - assert((unsigned long) vy1 % 128 == 0); - - const uint32_t qk = QK_Q4_0x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t x_qblk_size = qk / 2; // int4 - const uint32_t x_qrow_size = n / 2; // int4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; - - const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0; - const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size; - const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0; - const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; - - HVX_Vector r0_c0_sum = Q6_V_vzero(); - HVX_Vector r0_c1_sum = Q6_V_vzero(); - HVX_Vector r1_c0_sum = Q6_V_vzero(); - HVX_Vector r1_c1_sum = Q6_V_vzero(); - - const uint32_t nb = n / qk; - const uint32_t nloe = n % qk; - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_full(r1_x_q + i * x_qblk_size); - - HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q)); - HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q)); - HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q)); - HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q)); - - HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); - HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d))); - HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d))); - HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d))); - HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d))); - - HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd); - HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd); - HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd); - HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd); - - r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum)); - r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum)); - r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum)); - r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum)); - } - - if (nloe) { - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial(y0_q + i * y_qblk_size, nloe); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial(y1_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_iq4nlx4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_iq4nlx4x8_partial(r1_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe)); - HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe)); - HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe)); - HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe)); - - HVX_Vector vy0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y0_d + i * y_dblk_size)); - HVX_Vector vy1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (y1_d + i * y_dblk_size)); - HVX_Vector r0_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r0_x_d + i * x_dblk_size)); - HVX_Vector r1_d = Q6_Vh_vshuff_Vh(*(const HVX_UVector *) (r1_x_d + i * x_dblk_size)); - - HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy0_d))); - HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r0_d, vy1_d))); - HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy0_d))); - HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(r1_d, vy1_d))); - - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd); - r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd); - r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd); - r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd); - r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia); - r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia); - r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia); - r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia); - - HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd); - HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd); - HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd); - HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd); - - r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum)); - r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum)); - r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum)); - r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum)); - } - - HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum); - HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum); - - hvx_vec_store_u(&s0[0], 8, r0_r1_c0_sum); - hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum); -} - -static void vec_dot_mxfp4x4x2_q8x4x2_1x1(const int n, float * restrict s0, const void * restrict vx0, const void * restrict vy0) { - assert(n % 32 == 0); // min sub-block size - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_MXFP4x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 1; // 32x e8m0 - const uint32_t x_qblk_size = qk / 2; // fp4 - const uint32_t x_qrow_size = n / 2; // fp4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0 + 0); // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0 + x_qrow_size); // then scales - - const uint8_t * restrict y_q = ((const uint8_t *) vy0 + 0); // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales - - // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vzero(); - - // Multiply and accumulate into int32. - // Compute combined scale (fp32). - // Apply scale to acc and accumulate into the row sum (qf32). - - const uint32_t nb = n / qk; // num full blocks - int32_t nloe = n % qk; // num leftover elemements (must be signed) - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - - // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving - HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 - vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); - vy_d = Q6_Vsf_equals_Vqf32(vy_d); - - // Convert rX_d scales from e8m0 to fp32 - // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... - // Left shift with zero fill to create FP32 - // FIXME: might need to handle zero as a special case (see ggml-cpu code) - HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; - HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); - r0_d = Q6_V_vdelta_VV(r0_d, expand); - r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); - r0_d = Q6_Vw_vasl_VwR(r0_d, 23); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - } - - // Process leftovers - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy_q, nloe)); - - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - - // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving - HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 - vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); - vy_d = Q6_Vsf_equals_Vqf32(vy_d); - - // Convert rX_d scales from e8m0 to fp32 - // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... - // Left shift with zero fill to create FP32 - // FIXME: might need to handle zero as a special case (see ggml-cpu code) - HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; - HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); - r0_d = Q6_V_vdelta_VV(r0_d, expand); - r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); - r0_d = Q6_Vw_vasl_VwR(r0_d, 23); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); - - // Zero-out unused scales - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - } - - r0_sum = hvx_vec_reduce_sum_f32(r0_sum); - - hvx_vec_store_u(s0, 4, r0_sum); -} - -static void vec_dot_mxfp4x4x2_q8x4x2_2x1(const int n, float * restrict s0, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vy0) { - assert(n % 32 == 0); // min sub-block size - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_MXFP4x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 1; // 32x e8m0 - const uint32_t x_qblk_size = qk / 2; // fp4 - const uint32_t x_qrow_size = n / 2; // fp4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales - - const uint8_t * restrict y_q = ((const uint8_t *) vy0) + 0; // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales - - // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vzero(); - HVX_Vector r1_sum = Q6_V_vzero(); - - // Multiply and accumulate into int32. - // Compute combined scale (fp32). - // Apply scale to acc and accumulate into the row sum (f32). - - const uint32_t nb = n / qk; // num full blocks - int32_t nloe = n % qk; // num leftover elemements (must be signed) - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - - // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving - HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 - vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); - vy_d = Q6_Vsf_equals_Vqf32(vy_d); - - // Convert rX_d scales from e8m0 to fp32 - // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... - // Left shift with zero fill to create FP32 - // FIXME: might need to handle zero as a special case (see ggml-cpu code) - HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; - HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); - r0_d = Q6_V_vdelta_VV(r0_d, expand); - r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); - r0_d = Q6_Vw_vasl_VwR(r0_d, 23); - r1_d = Q6_V_vdelta_VV(r1_d, expand); - r1_d = Q6_V_vand_VV(r1_d, e8m0_mask); - r1_d = Q6_Vw_vasl_VwR(r1_d, 23); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d)); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - } - - // Process leftovers - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - - // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving - HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 - vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); - vy_d = Q6_Vsf_equals_Vqf32(vy_d); - - // Convert rX_d scales from e8m0 to fp32 - // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... - // Left shift with zero fill to create FP32 - // FIXME: might need to handle zero as a special case (see ggml-cpu code) - HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; - HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); - r0_d = Q6_V_vdelta_VV(r0_d, expand); - r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); - r0_d = Q6_Vw_vasl_VwR(r0_d, 23); - r1_d = Q6_V_vdelta_VV(r1_d, expand); - r1_d = Q6_V_vand_VV(r1_d, e8m0_mask); - r1_d = Q6_Vw_vasl_VwR(r1_d, 23); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d)); - - // Zero-out unused values - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r1_dd = Q6_V_vand_QV(bmask, r1_dd); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - r1_ia = Q6_V_vand_QV(bmask, r1_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - } - - HVX_Vector rsum = hvx_vec_reduce_sum_f32x2(r0_sum, r1_sum); - hvx_vec_store_u(s0, 8, rsum); -} - -static void vec_dot_mxfp4x4x2_q8x4x2_4x1(const int n, float * restrict s0, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vx2, const void * restrict vx3, - const void * restrict vy0) { - assert(n % 32 == 0); // min sub-block size - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vx2 % 128 == 0); - assert((unsigned long) vx3 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - - const uint32_t qk = QK_MXFP4x4x2 * 4; - - const uint32_t x_dblk_size = 8 * 4 * 1; // 32x e8m0 - const uint32_t x_qblk_size = qk / 2; // fp4 - const uint32_t x_qrow_size = n / 2; // fp4 (not padded) - - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) - - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales - const uint8_t * restrict r2_x_q = ((const uint8_t *) vx2) + 0; // quants first - const uint8_t * restrict r2_x_d = ((const uint8_t *) vx2) + x_qrow_size; // then scales - const uint8_t * restrict r3_x_q = ((const uint8_t *) vx3) + 0; // quants first - const uint8_t * restrict r3_x_d = ((const uint8_t *) vx3) + x_qrow_size; // then scales - - const uint8_t * restrict y_q = ((const uint8_t *) vy0) + 0; // quants first - const uint8_t * restrict y_d = ((const uint8_t *) vy0 + y_qrow_size); // then scales - - // Row sum (sf) - HVX_Vector r0_sum = Q6_V_vzero(); - HVX_Vector r1_sum = Q6_V_vzero(); - HVX_Vector r2_sum = Q6_V_vzero(); - HVX_Vector r3_sum = Q6_V_vzero(); - - const uint32_t nb = n / qk; // num full blocks - int32_t nloe = n % qk; // num leftover elemements (must be signed) - - uint32_t i = 0; - for (; i < nb; i++) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_full( y_q + i * y_qblk_size); - HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size); - HVX_Vector_x8 r2_q = hvx_vec_load_mxfp4x4x8_full(r2_x_q + i * x_qblk_size); - HVX_Vector_x8 r3_q = hvx_vec_load_mxfp4x4x8_full(r3_x_q + i * x_qblk_size); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q)); - HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q)); - - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - HVX_Vector r2_d = *(const HVX_UVector *) (r2_x_d + i * x_dblk_size); - HVX_Vector r3_d = *(const HVX_UVector *) (r3_x_d + i * x_dblk_size); - - // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving - HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 - vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); - vy_d = Q6_Vsf_equals_Vqf32(vy_d); - - // Convert rX_d scales from e8m0 to fp32 - HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; - HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); - r0_d = Q6_V_vdelta_VV(r0_d, expand); - r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); - r0_d = Q6_Vw_vasl_VwR(r0_d, 23); - r1_d = Q6_V_vdelta_VV(r1_d, expand); - r1_d = Q6_V_vand_VV(r1_d, e8m0_mask); - r1_d = Q6_Vw_vasl_VwR(r1_d, 23); - r2_d = Q6_V_vdelta_VV(r2_d, expand); - r2_d = Q6_V_vand_VV(r2_d, e8m0_mask); - r2_d = Q6_Vw_vasl_VwR(r2_d, 23); - r3_d = Q6_V_vdelta_VV(r3_d, expand); - r3_d = Q6_V_vand_VV(r3_d, e8m0_mask); - r3_d = Q6_Vw_vasl_VwR(r3_d, 23); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d)); - HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r2_d, vy_d)); - HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r3_d, vy_d)); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); - HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); - r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); - } - - if (nloe) { - HVX_Vector_x8 vy_q = hvx_vec_load_q8x4x8_partial( y_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r2_q = hvx_vec_load_mxfp4x4x8_partial(r2_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r3_q = hvx_vec_load_mxfp4x4x8_partial(r3_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy_q)); - HVX_Vector r1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy_q)); - HVX_Vector r2_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r2_q, vy_q)); - HVX_Vector r3_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r3_q, vy_q)); - - HVX_Vector vy_d = *(const HVX_UVector *) (y_d + i * y_dblk_size); - HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - HVX_Vector r2_d = *(const HVX_UVector *) (r2_x_d + i * x_dblk_size); - HVX_Vector r3_d = *(const HVX_UVector *) (r3_x_d + i * x_dblk_size); - - // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving - HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 - vy_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy_d), half)); - vy_d = Q6_Vsf_equals_Vqf32(vy_d); - - // Convert rX_d scales from e8m0 to fp32 - HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; - HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); - r0_d = Q6_V_vdelta_VV(r0_d, expand); - r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); - r0_d = Q6_Vw_vasl_VwR(r0_d, 23); - r1_d = Q6_V_vdelta_VV(r1_d, expand); - r1_d = Q6_V_vand_VV(r1_d, e8m0_mask); - r1_d = Q6_Vw_vasl_VwR(r1_d, 23); - r2_d = Q6_V_vdelta_VV(r2_d, expand); - r2_d = Q6_V_vand_VV(r2_d, e8m0_mask); - r2_d = Q6_Vw_vasl_VwR(r2_d, 23); - r3_d = Q6_V_vdelta_VV(r3_d, expand); - r3_d = Q6_V_vand_VV(r3_d, e8m0_mask); - r3_d = Q6_Vw_vasl_VwR(r3_d, 23); - - HVX_Vector r0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy_d)); - HVX_Vector r1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy_d)); - HVX_Vector r2_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r2_d, vy_d)); - HVX_Vector r3_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r3_d, vy_d)); - - // Zero-out unused values - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_dd = Q6_V_vand_QV(bmask, r0_dd); - r1_dd = Q6_V_vand_QV(bmask, r1_dd); - r2_dd = Q6_V_vand_QV(bmask, r2_dd); - r3_dd = Q6_V_vand_QV(bmask, r3_dd); - r0_ia = Q6_V_vand_QV(bmask, r0_ia); - r1_ia = Q6_V_vand_QV(bmask, r1_ia); - r2_ia = Q6_V_vand_QV(bmask, r2_ia); - r3_ia = Q6_V_vand_QV(bmask, r3_ia); - - HVX_Vector r0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_ia, r0_dd); - HVX_Vector r1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_ia, r1_dd); - HVX_Vector r2_fa = Q6_Vqf32_vmpy_VsfVsf(r2_ia, r2_dd); - HVX_Vector r3_fa = Q6_Vqf32_vmpy_VsfVsf(r3_ia, r3_dd); - - r0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_fa, r0_sum)); - r1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_fa, r1_sum)); - r2_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r2_fa, r2_sum)); - r3_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r3_fa, r3_sum)); - } - - HVX_Vector_x4 rsum_in = { .v = { r0_sum, r1_sum, r2_sum, r3_sum } }; - HVX_Vector rsum = hvx_vec_reduce_sum_f32x4(rsum_in); - hvx_vec_store_u(s0, 16, rsum); -} - - -static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float * restrict s1, - const void * restrict vx0, const void * restrict vx1, - const void * restrict vy0, const void * restrict vy1) { - assert(n % 32 == 0); - assert((unsigned long) vx0 % 128 == 0); - assert((unsigned long) vx1 % 128 == 0); - assert((unsigned long) vy0 % 128 == 0); - assert((unsigned long) vy1 % 128 == 0); - - const uint32_t qk = QK_MXFP4x4x2 * 4; +#include "matmul-ops.h" +#include "vtcm-utils.h" + +typedef struct { + float *dst; + const float *activation; + const __fp16 *weight; + int m; + int k; + int n; + int act_stride; + int weight_stride; + int dst_stride; + int ne02; + int ne03; + int ne12; + int ne13; + size_t src0_nb2; + size_t src0_nb3; + size_t src1_nb2; + size_t src1_nb3; + size_t dst_nb2; + size_t dst_nb3; +} hmx_mm_f16_f32_batched_params_t; + +struct htp_mm_context { + const char * type; + struct htp_ops_context * octx; - const uint32_t x_dblk_size = 8 * 4 * 1; // 32x e8m0 - const uint32_t x_qblk_size = qk / 2; // fp4 - const uint32_t x_qrow_size = n / 2; // fp4 (not padded) + void (*vec_dot_1x1)(const uint32_t n, float * restrict s0, + const void * restrict vx0, + const void * restrict vy0); - const uint32_t y_dblk_size = 8 * 4 * 2; // 32x __fp16 - const uint32_t y_qblk_size = qk; // int8 - const uint32_t y_qrow_size = n; // int8 (not padded) + void (*vec_dot_2x1)(const uint32_t n, float * restrict s0, + const void * restrict vx0, const void * restrict vx1, + const void * restrict vy0); - const uint8_t * restrict r0_x_q = ((const uint8_t *) vx0) + 0; // quants first - const uint8_t * restrict r0_x_d = ((const uint8_t *) vx0) + x_qrow_size; // then scales - const uint8_t * restrict r1_x_q = ((const uint8_t *) vx1) + 0; // quants first - const uint8_t * restrict r1_x_d = ((const uint8_t *) vx1) + x_qrow_size; // then scales + void (*vec_dot_2x2)(const uint32_t n, float * restrict s0, float * restrict s1, + const void * restrict vx0, const void * restrict vx1, + const void * restrict vy0, const void * restrict vy1); - const uint8_t * restrict y0_q = ((const uint8_t *) vy0) + 0; // quants first - const uint8_t * restrict y0_d = ((const uint8_t *) vy0) + y_qrow_size; // then scales - const uint8_t * restrict y1_q = ((const uint8_t *) vy1) + 0; // quants first - const uint8_t * restrict y1_d = ((const uint8_t *) vy1) + y_qrow_size; // then scales + void (*vec_dot_32x1)(const uint32_t n, float * restrict s, + const void * restrict vx, + const void * restrict vy, uint32_t valid_rows); - // Row sums (sf) - 4 accumulators for 2×2 tile - HVX_Vector r0_c0_sum = Q6_V_vzero(); - HVX_Vector r0_c1_sum = Q6_V_vzero(); - HVX_Vector r1_c0_sum = Q6_V_vzero(); - HVX_Vector r1_c1_sum = Q6_V_vzero(); + // Precomputed values + uint32_t src0_nrows_per_thread; + uint32_t src1_nrows_per_thread; - const uint32_t nb = n / qk; // num full blocks - const uint32_t nloe = n % qk; // num leftover elements + struct fastdiv_values mm_div_ne12_ne1; + struct fastdiv_values mm_div_ne1; + struct fastdiv_values mm_div_r2; + struct fastdiv_values mm_div_r3; + struct fastdiv_values mm_div_ne11; - uint32_t i = 0; - for (; i < nb; i++) { - // Load src1 columns (reused across both src0 rows) - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_full(y0_q + i * y_qblk_size); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_full(y1_q + i * y_qblk_size); + // Precomputed block-parallel quantization values + uint32_t quant_ib_first[MAX_NUM_WORKERS]; + uint32_t quant_ib_last[MAX_NUM_WORKERS]; + uint32_t quant_r[MAX_NUM_WORKERS]; + uint32_t quant_c[MAX_NUM_WORKERS]; - // Load src0 rows (reused across both src1 columns) - HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_full(r0_x_q + i * x_qblk_size); - HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_full(r1_x_q + i * x_qblk_size); + // Fields for scattered mapping & HMX support in MUL_MAT_ID + const uint32_t * matrix_row_counts; + const struct mmid_row_mapping * matrix_rows; - // Compute 4 dot products: r0×c0, r0×c1, r1×c0, r1×c1 - HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy0_q)); - HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r0_q, vy1_q)); - HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy0_q)); - HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_full(r1_q, vy1_q)); - - // Load scales - HVX_Vector vy0_d = *(const HVX_UVector *) (y0_d + i * y_dblk_size); - HVX_Vector vy1_d = *(const HVX_UVector *) (y1_d + i * y_dblk_size); - HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - - // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving - HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 - vy0_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy0_d), half)); - vy0_d = Q6_Vsf_equals_Vqf32(vy0_d); - vy1_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy1_d), half)); - vy1_d = Q6_Vsf_equals_Vqf32(vy1_d); - - // Convert rX_d scales from e8m0 to fp32 - // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... - // Left shift with zero fill to create FP32 - // FIXME: might need to handle zero as a special case (see ggml-cpu code) - HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; - HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); - r0_d = Q6_V_vdelta_VV(r0_d, expand); - r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); - r0_d = Q6_Vw_vasl_VwR(r0_d, 23); - r1_d = Q6_V_vdelta_VV(r1_d, expand); - r1_d = Q6_V_vand_VV(r1_d, e8m0_mask); - r1_d = Q6_Vw_vasl_VwR(r1_d, 23); - - // Compute combined scales - HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy0_d)); - HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy1_d)); - HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy0_d)); - HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy1_d)); - - // Apply scales and accumulate - HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd); - HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd); - HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd); - HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd); - - r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum)); - r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum)); - r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum)); - r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum)); - } + // Dynamic VTCM pointers allocated sequentially + uint8_t * vtcm_src0; + uint8_t * vtcm_src1; + uint8_t * vtcm_src2; + uint8_t * vtcm_src3; + uint8_t * vtcm_dst; + + // Cached strides + uint32_t vtcm_src0_stride; + uint32_t vtcm_src1_stride; + uint32_t vtcm_src2_stride; + uint32_t vtcm_src3_stride; + + // Cached thread offsets/sizes + uint32_t vtcm_src0_size_per_thread; + uint32_t vtcm_src1_size_per_thread; + uint32_t vtcm_src2_size_per_thread; + uint32_t vtcm_src3_size_per_thread; + uint32_t vtcm_dst_size_per_thread; +}; - // Process leftovers - if (nloe) { - HVX_Vector_x8 vy0_q = hvx_vec_load_q8x4x8_partial( y0_q + i * y_qblk_size, nloe); - HVX_Vector_x8 vy1_q = hvx_vec_load_q8x4x8_partial( y1_q + i * y_qblk_size, nloe); - HVX_Vector_x8 r0_q = hvx_vec_load_mxfp4x4x8_partial(r0_x_q + i * x_qblk_size, nloe); - HVX_Vector_x8 r1_q = hvx_vec_load_mxfp4x4x8_partial(r1_x_q + i * x_qblk_size, nloe); - - HVX_Vector r0_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy0_q, nloe)); - HVX_Vector r0_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r0_q, vy1_q, nloe)); - HVX_Vector r1_c0_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy0_q, nloe)); - HVX_Vector r1_c1_ia = Q6_Vsf_equals_Vw(hvx_vec_rmpy_x8_partial(r1_q, vy1_q, nloe)); - - HVX_Vector vy0_d = *(const HVX_UVector *) (y0_d + i * y_dblk_size); - HVX_Vector vy1_d = *(const HVX_UVector *) (y1_d + i * y_dblk_size); - HVX_Vector r0_d = *(const HVX_UVector *) (r0_x_d + i * x_dblk_size); - HVX_Vector r1_d = *(const HVX_UVector *) (r1_x_d + i * x_dblk_size); - - // Convert vy_d from fp16 to fp32 while applying 0.5 scaling which is used for e8m0 halving - HVX_Vector half = Q6_Vh_vsplat_R(0x3800); // 0.5 in fp16 - vy0_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy0_d), half)); - vy0_d = Q6_Vsf_equals_Vqf32(vy0_d); - vy1_d = Q6_V_lo_W(Q6_Wqf32_vmpy_VhfVhf(Q6_Vh_vshuff_Vh(vy1_d), half)); - vy1_d = Q6_Vsf_equals_Vqf32(vy1_d); - - // Convert rX_d scales from e8m0 to fp32 - // Expand and zero-pad 32x uint8 e8m0 values to uint32s : 0 0 0 0, 0 0 0 1, 0 0 0 2, ... - // Left shift with zero fill to create FP32 - // FIXME: might need to handle zero as a special case (see ggml-cpu code) - HVX_Vector expand = *(const HVX_Vector *) expand_x32_e8m0; - HVX_Vector e8m0_mask = Q6_V_vsplat_R(0x000000ff); - r0_d = Q6_V_vdelta_VV(r0_d, expand); - r0_d = Q6_V_vand_VV(r0_d, e8m0_mask); - r0_d = Q6_Vw_vasl_VwR(r0_d, 23); - r1_d = Q6_V_vdelta_VV(r1_d, expand); - r1_d = Q6_V_vand_VV(r1_d, e8m0_mask); - r1_d = Q6_Vw_vasl_VwR(r1_d, 23); - - HVX_Vector r0_c0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy0_d)); - HVX_Vector r0_c1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r0_d, vy1_d)); - HVX_Vector r1_c0_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy0_d)); - HVX_Vector r1_c1_dd = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(r1_d, vy1_d)); - - // Zero out unused scales - HVX_VectorPred bmask = Q6_Q_vsetq_R(nloe / 8); - r0_c0_dd = Q6_V_vand_QV(bmask, r0_c0_dd); - r0_c1_dd = Q6_V_vand_QV(bmask, r0_c1_dd); - r1_c0_dd = Q6_V_vand_QV(bmask, r1_c0_dd); - r1_c1_dd = Q6_V_vand_QV(bmask, r1_c1_dd); - r0_c0_ia = Q6_V_vand_QV(bmask, r0_c0_ia); - r0_c1_ia = Q6_V_vand_QV(bmask, r0_c1_ia); - r1_c0_ia = Q6_V_vand_QV(bmask, r1_c0_ia); - r1_c1_ia = Q6_V_vand_QV(bmask, r1_c1_ia); - - HVX_Vector r0_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c0_ia, r0_c0_dd); - HVX_Vector r0_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r0_c1_ia, r0_c1_dd); - HVX_Vector r1_c0_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c0_ia, r1_c0_dd); - HVX_Vector r1_c1_fa = Q6_Vqf32_vmpy_VsfVsf(r1_c1_ia, r1_c1_dd); - - r0_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c0_fa, r0_c0_sum)); - r0_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r0_c1_fa, r0_c1_sum)); - r1_c0_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c0_fa, r1_c0_sum)); - r1_c1_sum = Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_Vqf32Vsf(r1_c1_fa, r1_c1_sum)); - } +// vdelta control to expand first 32 e8m0 values into 32 uint32 elements +static const uint8_t __attribute__((aligned(128))) expand_x32_e8m0[128] = { + 0x00, 0x00, 0x00, 0x00, 0x01, 0x04, 0x00, 0x00, 0x02, 0x00, 0x08, 0x08, 0x01, 0x02, 0x00, 0x04, 0x04, 0x00, 0x00, + 0x00, 0x11, 0x10, 0x10, 0x10, 0x02, 0x00, 0x04, 0x00, 0x01, 0x02, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00, 0x01, 0x04, + 0x00, 0x00, 0x22, 0x20, 0x20, 0x20, 0x21, 0x22, 0x20, 0x24, 0x04, 0x00, 0x00, 0x00, 0x09, 0x08, 0x00, 0x00, 0x02, + 0x00, 0x04, 0x00, 0x11, 0x12, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x01, 0x04, 0x00, 0x00, 0x02, 0x00, 0x08, 0x08, + 0x01, 0x02, 0x00, 0x04, 0x44, 0x40, 0x40, 0x40, 0x41, 0x40, 0x40, 0x40, 0x42, 0x40, 0x44, 0x40, 0x41, 0x42, 0x48, + 0x48, 0x08, 0x08, 0x00, 0x00, 0x01, 0x04, 0x00, 0x00, 0x12, 0x10, 0x10, 0x10, 0x01, 0x02, 0x00, 0x04, 0x04, 0x00, + 0x00, 0x00, 0x09, 0x08, 0x00, 0x00, 0x22, 0x20, 0x24, 0x20, 0x21, 0x22, 0x20, 0x20, +}; - // Reduce and store results - HVX_Vector r0_r1_c0_sum = hvx_vec_reduce_sum_f32x2(r0_c0_sum, r1_c0_sum); - HVX_Vector r0_r1_c1_sum = hvx_vec_reduce_sum_f32x2(r0_c1_sum, r1_c1_sum); +// IQ4_NL dequantization LUT: maps 4-bit index (0-15) to int8 kvalue +// kvalues: -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113 +static const uint8_t __attribute__((aligned(VLEN))) kvalues_iq4nl_lut[] = { + 0x81, 0, 0x98, 0, 0xAD, 0, 0xBF, 0, 0xCF, 0, 0xDD, 0, 0xEA, 0, 0xF6, 0, 0x01, 0, 0x0D, 0, 0x19, 0, 0x26, 0, + 0x35, 0, 0x45, 0, 0x59, 0, 0x71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; - hvx_vec_store_u(&s0[0], 8, r0_r1_c0_sum); // row0,col0 row1,col0 - hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum); // row0,col1 row1,col1 -} +static const uint8_t __attribute__((aligned(VLEN))) kvalues_mxfp4_lut[] = { + 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 6, 0, 8, 0, 12, 0, 0, 0, 0xff, 0, 0xfe, 0, 0xfd, 0, 0xfc, 0, + 0xfa, 0, 0xf8, 0, 0xf4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; #if __HVX_ARCH__ < 79 #define HVX_OP_ADD_F32(a, b) Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a, b)) @@ -2926,7 +141,7 @@ static void vec_dot_mxfp4x4x2_q8x4x2_2x2(const int n, float * restrict s0, float #define HVX_OP_MUL_F32(a, b) Q6_Vsf_vmpy_VsfVsf(a, b) #endif -static void vec_dot_f32_f32_aa_1x1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void vec_dot_f32_f32_aa_1x1(const uint32_t n, float * restrict s, const void * restrict vx, const void * restrict vy) { const HVX_Vector * restrict x = (const HVX_Vector *) vx; const HVX_Vector * restrict y = (const HVX_Vector *) vy; @@ -2954,7 +169,7 @@ static void vec_dot_f32_f32_aa_1x1(const int n, float * restrict s, const void * *s = hvx_vec_get_f32(hvx_vec_reduce_sum_f32(rsum)); } -static void vec_dot_f32_f32_aa_2x1(const int n, float * restrict s0, +static void vec_dot_f32_f32_aa_2x1(const uint32_t n, float * restrict s0, const void * restrict vx0, const void * restrict vx1, const void * restrict vy0) { const HVX_Vector * restrict x0 = (const HVX_Vector *) vx0; @@ -2996,7 +211,7 @@ static void vec_dot_f32_f32_aa_2x1(const int n, float * restrict s0, s0[1] = va.fp32[1]; } -static void vec_dot_f32_f32_aa_2x2(const int n, float * restrict s0, float * restrict s1, +static void vec_dot_f32_f32_aa_2x2(const uint32_t n, float * restrict s0, float * restrict s1, const void * restrict vx0, const void * restrict vx1, const void * restrict vy0, const void * restrict vy1) { const HVX_Vector * restrict x0 = (const HVX_Vector *) vx0; @@ -3054,7 +269,7 @@ static void vec_dot_f32_f32_aa_2x2(const int n, float * restrict s0, float * res s1[1] = va1.fp32[1]; } -static void vec_dot_f32_f32_uu_1x1(const int n, float * restrict s, const void * restrict x, const void * restrict y) { +static void vec_dot_f32_f32_uu_1x1(const uint32_t n, float * restrict s, const void * restrict x, const void * restrict y) { const HVX_UVector * restrict vx = (const HVX_UVector * restrict) x; const HVX_UVector * restrict vy = (const HVX_UVector * restrict) y; @@ -3088,7 +303,7 @@ static void vec_dot_f32_f32_uu_1x1(const int n, float * restrict s, const void * hvx_vec_store_u(&s[0], 4, rsum); } -static void vec_dot_f16_f16_aa_1x1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void vec_dot_f16_f16_aa_1x1(const uint32_t n, float * restrict s, const void * restrict vx, const void * restrict vy) { const HVX_Vector * restrict x = (const HVX_Vector *) vx; const HVX_Vector * restrict y = (const HVX_Vector *) vy; @@ -3115,7 +330,7 @@ static void vec_dot_f16_f16_aa_1x1(const int n, float * restrict s, const void * hvx_vec_store_u(s, 4, hvx_vec_reduce_sum_f32(rsum)); } -static void vec_dot_f16_f16_aa_2x1(const int n, float * restrict s0, +static void vec_dot_f16_f16_aa_2x1(const uint32_t n, float * restrict s0, const void * restrict vx0, const void * restrict vx1, const void * restrict vy0) { const HVX_Vector * restrict x0 = (const HVX_Vector *) vx0; @@ -3152,7 +367,7 @@ static void vec_dot_f16_f16_aa_2x1(const int n, float * restrict s0, hvx_vec_store_u(s0, 8, rsum); } -static void vec_dot_f16_f16_aa_2x2(const int n, float * restrict s0, float * restrict s1, +static void vec_dot_f16_f16_aa_2x2(const uint32_t n, float * restrict s0, float * restrict s1, const void * restrict vx0, const void * restrict vx1, const void * restrict vy0, const void * restrict vy1) { const HVX_Vector * restrict x0 = (const HVX_Vector *) vx0; @@ -3212,7 +427,7 @@ static void vec_dot_f16_f16_aa_2x2(const int n, float * restrict s0, float * res hvx_vec_store_u(&s1[0], 8, r0_r1_c1_sum); // row0,col1 row1,col1 } -static void vec_dot_f16_f16_uu_1x1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +static void vec_dot_f16_f16_uu_1x1(const uint32_t n, float * restrict s, const void * restrict vx, const void * restrict vy) { const HVX_UVector * restrict x = (const HVX_UVector *) vx; const HVX_UVector * restrict y = (const HVX_UVector *) vy; @@ -3242,7 +457,7 @@ static void vec_dot_f16_f16_uu_1x1(const int n, float * restrict s, const void * hvx_vec_store_u(&s[0], 4, rsum); } -static void vec_dot_f16_f32_uu_1x1(const int n, float * restrict s, const void * restrict x, const void * restrict y) { +static void vec_dot_f16_f32_uu_1x1(const uint32_t n, float * restrict s, const void * restrict x, const void * restrict y) { const HVX_UVector * restrict vx = (const HVX_UVector * restrict) x; const HVX_UVector * restrict vy = (const HVX_UVector * restrict) y; @@ -3295,65 +510,59 @@ static void vec_dot_f16_f32_uu_1x1(const int n, float * restrict s, const void * hvx_vec_store_u(&s[0], 4, rsum); } -#define htp_matmul_tensors_preamble \ - const struct htp_tensor * restrict src0 = octx->src[0]; \ - const struct htp_tensor * restrict src1 = octx->src[1]; \ - const struct htp_tensor * restrict src2 = octx->src[2]; \ - const struct htp_tensor * restrict dst = octx->dst; \ - struct htp_spad * restrict src0_spad = &octx->src0_spad; \ - struct htp_spad * restrict src1_spad = &octx->src1_spad; \ - struct htp_spad * restrict dst_spad = &octx->dst_spad; \ - \ - const uint32_t ne00 = src0->ne[0]; \ - const uint32_t ne01 = src0->ne[1]; \ - const uint32_t ne02 = src0->ne[2]; \ - const uint32_t ne03 = src0->ne[3]; \ - \ - const uint32_t ne10 = src1->ne[0]; \ - const uint32_t ne11 = src1->ne[1]; \ - const uint32_t ne12 = src1->ne[2]; \ - const uint32_t ne13 = src1->ne[3]; \ - \ - const uint32_t ne20 = src2->ne[0]; \ - const uint32_t ne21 = src2->ne[1]; \ - const uint32_t ne22 = src2->ne[2]; \ - const uint32_t ne23 = src2->ne[3]; \ - \ - const uint32_t ne0 = dst->ne[0]; \ - const uint32_t ne1 = dst->ne[1]; \ - const uint32_t ne2 = dst->ne[2]; \ - const uint32_t ne3 = dst->ne[3]; \ - \ - const uint32_t nb00 = src0->nb[0]; \ - const uint32_t nb01 = src0->nb[1]; \ - const uint32_t nb02 = src0->nb[2]; \ - const uint32_t nb03 = src0->nb[3]; \ - \ - const uint32_t nb10 = src1->nb[0]; \ - const uint32_t nb11 = src1->nb[1]; \ - const uint32_t nb12 = src1->nb[2]; \ - const uint32_t nb13 = src1->nb[3]; \ - \ - const uint32_t nb0 = dst->nb[0]; \ - const uint32_t nb1 = dst->nb[1]; \ - const uint32_t nb2 = dst->nb[2]; \ +#define htp_matmul_tensors_preamble \ + const struct htp_tensor * restrict src0 = octx->src[0]; \ + const struct htp_tensor * restrict src1 = octx->src[1]; \ + const struct htp_tensor * restrict src2 = octx->src[2]; \ + const struct htp_tensor * restrict dst = octx->dst; \ + \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne01 = src0->ne[1]; \ + const uint32_t ne02 = src0->ne[2]; \ + const uint32_t ne03 = src0->ne[3]; \ + \ + const uint32_t ne10 = src1->ne[0]; \ + const uint32_t ne11 = src1->ne[1]; \ + const uint32_t ne12 = src1->ne[2]; \ + const uint32_t ne13 = src1->ne[3]; \ + \ + const uint32_t ne20 = src2->ne[0]; \ + const uint32_t ne21 = src2->ne[1]; \ + const uint32_t ne22 = src2->ne[2]; \ + const uint32_t ne23 = src2->ne[3]; \ + \ + const uint32_t ne0 = dst->ne[0]; \ + const uint32_t ne1 = dst->ne[1]; \ + const uint32_t ne2 = dst->ne[2]; \ + const uint32_t ne3 = dst->ne[3]; \ + \ + const uint32_t nb00 = src0->nb[0]; \ + const uint32_t nb01 = src0->nb[1]; \ + const uint32_t nb02 = src0->nb[2]; \ + const uint32_t nb03 = src0->nb[3]; \ + \ + const uint32_t nb10 = src1->nb[0]; \ + const uint32_t nb11 = src1->nb[1]; \ + const uint32_t nb12 = src1->nb[2]; \ + const uint32_t nb13 = src1->nb[3]; \ + \ + const uint32_t nb0 = dst->nb[0]; \ + const uint32_t nb1 = dst->nb[1]; \ + const uint32_t nb2 = dst->nb[2]; \ const uint32_t nb3 = dst->nb[3]; -#define htp_matmul_preamble \ - struct htp_matmul_context * mmctx = data; \ - struct htp_ops_context * octx = mmctx->octx; \ - htp_matmul_tensors_preamble; \ - dma_queue *dma_queue = octx->ctx->dma[ith]; \ - uint32_t src0_nrows_per_thread = mmctx->src0_nrows_per_thread; +#define htp_matmul_preamble \ + struct htp_mm_context * mmctx = data; \ + struct htp_ops_context * octx = mmctx->octx; \ + dma_queue *dma_queue = octx->ctx->dma[ith]; \ + uint32_t src0_nrows_per_thread = mmctx->src0_nrows_per_thread; \ + htp_matmul_tensors_preamble; // *** matmul with support for 4d tensors and full broadcasting -static void matmul_4d(unsigned int nth, unsigned int ith, void * data) { +static void hvx_mm_4d(unsigned int nth, unsigned int ith, void * data) { htp_matmul_preamble; - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); - assert(ne12 % ne02 == 0); assert(ne13 % ne03 == 0); @@ -3387,7 +596,9 @@ static void matmul_4d(unsigned int nth, unsigned int ith, void * data) { return; } - // block-tiling attempt + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0_start); + const uint32_t blck_0 = 64; const uint32_t blck_1 = 64; @@ -3419,18 +630,599 @@ static void matmul_4d(unsigned int nth, unsigned int ith, void * data) { } } - t2 = HAP_perf_get_qtimer_count(); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0_start); +} + +#include "hmx-mm-kernels-tiled.h" +#include "hvx-mm-kernels-tiled.h" +#include "hvx-mm-kernels-flat.h" + +// Specialized repacked matmul macros +#define MATMUL_2D_REPACKED_IMPL(SUFFIX, TILE_SIZE, DOT_2X2, DOT_2X1) \ +static void hvx_mm_2d_repacked_##SUFFIX(unsigned int nth, unsigned int ith, void * data) { \ + htp_matmul_preamble; \ + \ + const uint32_t src0_nrows = ne01 * ne02 * ne03; \ + const uint32_t src1_nrows = ne11 * ne12 * ne13; \ + \ + const uint32_t src0_start_row = src0_nrows_per_thread * ith; \ + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); \ + \ + if (src0_start_row >= src0_end_row) { \ + return; \ + } \ + \ + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; \ + \ + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; \ + const uint32_t n_prefetch = kparams->n_prefetch; \ + assert(n_prefetch >= 2 && n_prefetch <= HTP_MM_MAX_PREFETCH && (n_prefetch & (n_prefetch - 1)) == 0); \ + \ + const size_t dst_row_size = nb1; \ + const size_t src1_row_size = nb11; \ + const size_t src1_stride = mmctx->vtcm_src1_stride; \ + \ + uint8_t * restrict vtcm_dst_ptr = mmctx->vtcm_dst + mmctx->vtcm_dst_size_per_thread * ith; \ + uint8_t * restrict vtcm_src0_ptr = mmctx->vtcm_src0 + mmctx->vtcm_src0_size_per_thread * ith; \ + uint8_t * restrict src1_data = mmctx->vtcm_src1; \ + \ + const uint8_t * restrict src0_row = (const uint8_t *) src0->data; \ + \ + const uint32_t tile_size = TILE_SIZE; \ + const uint32_t aligned_tile_size = hex_align_up(tile_size, 128); \ + \ + uint32_t n_k_tiles_w = ne00 / 32; \ + uint32_t n_k_tiles_a = ne10 / 32; \ + uint32_t tile_row_stride = n_k_tiles_w * tile_size; \ + uint32_t tile_row_transfer_size_aligned = n_k_tiles_a * aligned_tile_size; \ + \ + uint32_t ct_start = src0_start_row / 32; \ + uint32_t ct_end = (src0_end_row + 31) / 32; \ + \ + uint32_t push_ct = ct_start; \ + for (uint32_t d = 0; d < n_prefetch && push_ct < ct_end; d++, push_ct++) { \ + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + d * tile_row_transfer_size_aligned, \ + src0_row + push_ct * tile_row_stride), aligned_tile_size, tile_size, tile_size, n_k_tiles_a); \ + } \ + \ + for (uint32_t ct = ct_start; ct < ct_end; ct++) { \ + const uint8_t * w_tile = dma_queue_pop(dma_queue).dst; \ + \ + int valid_rows = (int)ne0 - (int)(ct * 32); \ + valid_rows = MIN(32, MAX(0, valid_rows)); \ + \ + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ct); \ + uint32_t ir1 = 0; \ + for (; ir1 + 1 < src1_nrows; ir1 += 2) { \ + const uint8_t * restrict src1_col0 = (const uint8_t *) (src1_data + (ir1+0) * src1_stride); \ + const uint8_t * restrict src1_col1 = (const uint8_t *) (src1_data + (ir1+1) * src1_stride); \ + float * restrict dst_row0 = (float *) (dst->data + ((ir1+0) * dst_row_size)); \ + float * restrict dst_row1 = (float *) (dst->data + ((ir1+1) * dst_row_size)); \ + \ + float * dst_ptr0 = &dst_row0[ct * 32]; \ + float * dst_ptr1 = &dst_row1[ct * 32]; \ + \ + DOT_2X2(ne10, dst_ptr0, dst_ptr1, w_tile, src1_col0, src1_col1, valid_rows); \ + } \ + \ + for (; ir1 < src1_nrows; ++ir1) { \ + const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride); \ + float * restrict dst_row = (float *) (dst->data + (ir1 * dst_row_size)); \ + float * dst_ptr = &dst_row[ct * 32]; \ + \ + DOT_2X1(ne10, dst_ptr, w_tile, src1_col, valid_rows); \ + } \ + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ct); \ + \ + if (push_ct < ct_end) { \ + dma_queue_push(dma_queue, dma_make_ptr((uint8_t *)w_tile, src0_row + push_ct * tile_row_stride), \ + aligned_tile_size, tile_size, tile_size, n_k_tiles_a); \ + push_ct++; \ + } \ + } \ +} + +#define MATVEC_2D_REPACKED_IMPL(SUFFIX, TILE_SIZE, DOT_2X1) \ +static void hvx_mv_2d_repacked_##SUFFIX(unsigned int nth, unsigned int ith, void * data) { \ + htp_matmul_preamble; \ + \ + const uint32_t src0_nrows = ne01; \ + \ + const uint32_t src0_start_row = src0_nrows_per_thread * ith; \ + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); \ + \ + if (src0_start_row >= src0_end_row) { \ + return; \ + } \ + \ + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; \ + \ + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; \ + const uint32_t n_prefetch = kparams->n_prefetch; \ + assert(n_prefetch >= 2 && n_prefetch <= HTP_MM_MAX_PREFETCH && (n_prefetch & (n_prefetch - 1)) == 0); \ + \ + const size_t dst_row_size = nb1; \ + const size_t src1_row_size = nb11; \ + const size_t src1_stride = mmctx->vtcm_src1_stride; \ + \ + uint8_t * vtcm_dst_ptr = mmctx->vtcm_dst + mmctx->vtcm_dst_size_per_thread * ith; \ + uint8_t * vtcm_src0_ptr = mmctx->vtcm_src0 + mmctx->vtcm_src0_size_per_thread * ith; \ + uint8_t * src1_data = mmctx->vtcm_src1; \ + \ + float * tmp = (float *) vtcm_dst_ptr; \ + \ + const uint8_t * restrict src0_row = (const uint8_t *) src0->data; \ + const uint8_t * restrict src1_col = (const uint8_t *) src1_data; \ + float * restrict dst_col = (float *) dst->data; \ + \ + const uint32_t tile_size = TILE_SIZE; \ + const uint32_t aligned_tile_size = hex_align_up(tile_size, 128); \ + \ + uint32_t n_k_tiles_w = ne00 / 32; \ + uint32_t n_k_tiles_a = ne10 / 32; \ + uint32_t tile_row_stride = n_k_tiles_w * tile_size; \ + uint32_t tile_row_transfer_size_aligned = n_k_tiles_a * aligned_tile_size; \ + \ + uint32_t ct_start = src0_start_row / 32; \ + uint32_t ct_end = (src0_end_row + 31) / 32; \ + \ + uint32_t push_ct = ct_start; \ + for (uint32_t d = 0; d < n_prefetch && push_ct < ct_end; d++, push_ct++) { \ + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + d * tile_row_transfer_size_aligned, \ + src0_row + push_ct * tile_row_stride), aligned_tile_size, tile_size, tile_size, n_k_tiles_a); \ + } \ + \ + for (uint32_t ct = ct_start; ct < ct_end; ct++) { \ + const uint8_t * w_tile = dma_queue_pop(dma_queue).dst; \ + \ + float * dst_ptr = &tmp[ct * 32 - src0_start_row]; \ + int valid_rows = (int)ne0 - (int)(ct * 32); \ + valid_rows = MIN(32, MAX(0, valid_rows)); \ + \ + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ct); \ + DOT_2X1(ne10, dst_ptr, w_tile, src1_col, valid_rows); \ + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ct); \ + \ + if (push_ct < ct_end) { \ + dma_queue_push(dma_queue, dma_make_ptr((uint8_t *)w_tile, src0_row + push_ct * tile_row_stride), \ + aligned_tile_size, tile_size, tile_size, n_k_tiles_a); \ + push_ct++; \ + } \ + } \ + \ + int copy_cnt = (int)MIN(src0_end_row, ne0) - (int)src0_start_row; \ + if (copy_cnt > 0) { \ + hvx_copy_f32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, copy_cnt); \ + } \ +} + +#define MATMUL_QKV_2D_REPACKED_IMPL(SUFFIX, TILE_SIZE, DOT_2X2, DOT_2X1) \ +static void hvx_mm_qkv_2d_repacked_##SUFFIX(unsigned int nth, unsigned int ith, void * data) { \ + struct htp_mm_context * mmctx = data; \ + struct htp_ops_context * octx = mmctx->octx; \ + \ + const struct htp_tensor * restrict src0 = octx->src[0]; /* Wk */ \ + const struct htp_tensor * restrict src1 = octx->src[1]; /* x */ \ + const struct htp_tensor * restrict src2 = octx->src[2]; /* Wv */ \ + const struct htp_tensor * restrict src3 = octx->src[3]; /* Wq */ \ + const struct htp_tensor * restrict dst_k = octx->dsts[0]; \ + const struct htp_tensor * restrict dst_v = octx->dsts[1]; \ + const struct htp_tensor * restrict dst_q = octx->dsts[2]; \ + \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne10 = src1->ne[0]; \ + const uint32_t src1_nrows = src1->ne[1] * src1->ne[2] * src1->ne[3]; \ + \ + const size_t dst_k_row_size = dst_k->nb[1]; /* K and V share output width */ \ + const size_t dst_q_row_size = dst_q->nb[1]; /* Q may be wider (GQA) */ \ + const size_t src1_stride = mmctx->vtcm_src1_stride; \ + \ + uint8_t * restrict vtcm_src0_ptr = mmctx->vtcm_src0 + mmctx->vtcm_src0_size_per_thread * ith; \ + uint8_t * restrict vtcm_src2_ptr = mmctx->vtcm_src2 + mmctx->vtcm_src2_size_per_thread * ith; \ + uint8_t * restrict vtcm_src3_ptr = mmctx->vtcm_src3 + mmctx->vtcm_src3_size_per_thread * ith; \ + uint8_t * restrict src1_data = mmctx->vtcm_src1; \ + \ + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; \ + \ + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; \ + const uint32_t n_prefetch = kparams->n_prefetch; \ + assert(n_prefetch >= 2 && n_prefetch <= HTP_MM_MAX_PREFETCH && (n_prefetch & (n_prefetch - 1)) == 0); \ + \ + const uint8_t * restrict src0_row = (const uint8_t *) src0->data; \ + const uint8_t * restrict src2_row = (const uint8_t *) src2->data; \ + const uint8_t * restrict src3_row = (const uint8_t *) src3->data; \ + \ + const uint32_t tile_size = TILE_SIZE; \ + const uint32_t aligned_tile_size = hex_align_up(tile_size, 128); \ + \ + uint32_t n_k_tiles_w = ne00 / 32; \ + uint32_t n_k_tiles_a = ne10 / 32; \ + uint32_t tile_row_stride = n_k_tiles_w * tile_size; \ + uint32_t tile_row_transfer_size_aligned = n_k_tiles_a * aligned_tile_size; \ + \ + dma_queue * dma_queue = octx->ctx->dma[ith]; \ + \ + /* 1. Process K and V together */ \ + const uint32_t src0_nrows_kv = src0->ne[1] * src0->ne[2] * src0->ne[3]; /* src0 is Wk */ \ + uint32_t src0_nrows_per_thread_kv = (src0_nrows_kv + nth - 1) / nth; \ + src0_nrows_per_thread_kv = hex_round_up(src0_nrows_per_thread_kv, 32); \ + \ + const uint32_t start_row_kv = src0_nrows_per_thread_kv * ith; \ + const uint32_t end_row_kv = MIN(start_row_kv + src0_nrows_per_thread_kv, src0_nrows_kv); \ + \ + if (start_row_kv < end_row_kv) { \ + uint32_t ct_start_kv = start_row_kv / 32; \ + uint32_t ct_end_kv = (end_row_kv + 31) / 32; \ + \ + uint32_t push_ct = ct_start_kv; \ + for (uint32_t d = 0; d < n_prefetch && push_ct < ct_end_kv; d++, push_ct++) { \ + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + d * tile_row_transfer_size_aligned, \ + src0_row + push_ct * tile_row_stride), aligned_tile_size, tile_size, tile_size, n_k_tiles_a); \ + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src2_ptr + d * tile_row_transfer_size_aligned, \ + src2_row + push_ct * tile_row_stride), aligned_tile_size, tile_size, tile_size, n_k_tiles_a); \ + } \ + \ + for (uint32_t ct = ct_start_kv; ct < ct_end_kv; ct++) { \ + const uint8_t * w_tile_k = dma_queue_pop(dma_queue).dst; \ + const uint8_t * w_tile_v = dma_queue_pop(dma_queue).dst; \ + \ + int valid_rows = (int)src0->ne[1] - (int)(ct * 32); \ + valid_rows = MIN(32, MAX(0, valid_rows)); \ + \ + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ith); \ + uint32_t ir1 = 0; \ + for (; ir1 + 1 < src1_nrows; ir1 += 2) { \ + const uint8_t * restrict src1_col0 = (const uint8_t *) (src1_data + (ir1+0) * src1_stride); \ + const uint8_t * restrict src1_col1 = (const uint8_t *) (src1_data + (ir1+1) * src1_stride); \ + \ + float * restrict dst_row0_k = (float *) (dst_k->data + ((ir1+0) * dst_k_row_size)); \ + float * restrict dst_row1_k = (float *) (dst_k->data + ((ir1+1) * dst_k_row_size)); \ + float * dst_ptr0_k = &dst_row0_k[ct * 32]; \ + float * dst_ptr1_k = &dst_row1_k[ct * 32]; \ + \ + float * restrict dst_row0_v = (float *) (dst_v->data + ((ir1+0) * dst_k_row_size)); \ + float * restrict dst_row1_v = (float *) (dst_v->data + ((ir1+1) * dst_k_row_size)); \ + float * dst_ptr0_v = &dst_row0_v[ct * 32]; \ + float * dst_ptr1_v = &dst_row1_v[ct * 32]; \ + \ + DOT_2X2(ne10, dst_ptr0_k, dst_ptr1_k, w_tile_k, src1_col0, src1_col1, valid_rows); \ + DOT_2X2(ne10, dst_ptr0_v, dst_ptr1_v, w_tile_v, src1_col0, src1_col1, valid_rows); \ + } \ + \ + for (; ir1 < src1_nrows; ++ir1) { \ + const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride); \ + \ + float * restrict dst_row_k = (float *) (dst_k->data + (ir1 * dst_k_row_size)); \ + float * dst_ptr_k = &dst_row_k[ct * 32]; \ + \ + float * restrict dst_row_v = (float *) (dst_v->data + (ir1 * dst_k_row_size)); \ + float * dst_ptr_v = &dst_row_v[ct * 32]; \ + \ + DOT_2X1(ne10, dst_ptr_k, w_tile_k, src1_col, valid_rows); \ + DOT_2X1(ne10, dst_ptr_v, w_tile_v, src1_col, valid_rows); \ + } \ + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ith); \ + \ + if (push_ct < ct_end_kv) { \ + dma_queue_push(dma_queue, dma_make_ptr((uint8_t *)w_tile_k, src0_row + push_ct * tile_row_stride), \ + aligned_tile_size, tile_size, tile_size, n_k_tiles_a); \ + dma_queue_push(dma_queue, dma_make_ptr((uint8_t *)w_tile_v, src2_row + push_ct * tile_row_stride), \ + aligned_tile_size, tile_size, tile_size, n_k_tiles_a); \ + push_ct++; \ + } \ + } \ + } \ + \ + /* 2. Process Q separately */ \ + const uint32_t src0_nrows_q = src3->ne[1] * src3->ne[2] * src3->ne[3]; /* src3 is Wq */ \ + uint32_t src0_nrows_per_thread_q = (src0_nrows_q + nth - 1) / nth; \ + src0_nrows_per_thread_q = hex_round_up(src0_nrows_per_thread_q, 32); \ + \ + const uint32_t start_row_q = src0_nrows_per_thread_q * ith; \ + const uint32_t end_row_q = MIN(start_row_q + src0_nrows_per_thread_q, src0_nrows_q); \ + \ + if (start_row_q < end_row_q) { \ + uint32_t ct_start_q = start_row_q / 32; \ + uint32_t ct_end_q = (end_row_q + 31) / 32; \ + \ + uint32_t push_ct = ct_start_q; \ + for (uint32_t d = 0; d < n_prefetch && push_ct < ct_end_q; d++, push_ct++) { \ + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src3_ptr + d * tile_row_transfer_size_aligned, \ + src3_row + push_ct * tile_row_stride), aligned_tile_size, tile_size, tile_size, n_k_tiles_a); \ + } \ + \ + for (uint32_t ct = ct_start_q; ct < ct_end_q; ct++) { \ + const uint8_t * w_tile_q = dma_queue_pop(dma_queue).dst; \ + \ + int valid_rows = (int)src3->ne[1] - (int)(ct * 32); \ + valid_rows = MIN(32, MAX(0, valid_rows)); \ + \ + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ct); \ + uint32_t ir1 = 0; \ + for (; ir1 + 1 < src1_nrows; ir1 += 2) { \ + const uint8_t * restrict src1_col0 = (const uint8_t *) (src1_data + (ir1+0) * src1_stride); \ + const uint8_t * restrict src1_col1 = (const uint8_t *) (src1_data + (ir1+1) * src1_stride); \ + \ + float * restrict dst_row0_q = (float *) (dst_q->data + ((ir1+0) * dst_q_row_size)); \ + float * restrict dst_row1_q = (float *) (dst_q->data + ((ir1+1) * dst_q_row_size)); \ + float * dst_ptr0_q = &dst_row0_q[ct * 32]; \ + float * dst_ptr1_q = &dst_row1_q[ct * 32]; \ + \ + DOT_2X2(ne10, dst_ptr0_q, dst_ptr1_q, w_tile_q, src1_col0, src1_col1, valid_rows); \ + } \ + \ + for (; ir1 < src1_nrows; ++ir1) { \ + const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride); \ + \ + float * restrict dst_row_q = (float *) (dst_q->data + (ir1 * dst_q_row_size)); \ + float * dst_ptr_q = &dst_row_q[ct * 32]; \ + \ + DOT_2X1(ne10, dst_ptr_q, w_tile_q, src1_col, valid_rows); \ + } \ + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ct); \ + \ + if (push_ct < ct_end_q) { \ + dma_queue_push(dma_queue, dma_make_ptr((uint8_t *)w_tile_q, src3_row + push_ct * tile_row_stride), \ + aligned_tile_size, tile_size, tile_size, n_k_tiles_a); \ + push_ct++; \ + } \ + } \ + } \ +} + +#define MATMUL_FFN_2D_REPACKED_IMPL(SUFFIX, TILE_SIZE, DOT_2X2, DOT_2X1) \ +static void hvx_mm_ffn_2d_repacked_##SUFFIX(unsigned int nth, unsigned int ith, void * data) { \ + struct htp_mm_context * mmctx = data; \ + struct htp_ops_context * octx = mmctx->octx; \ + \ + const struct htp_tensor * restrict src0 = octx->src[0]; /* Wgate */ \ + const struct htp_tensor * restrict src1 = octx->src[1]; /* y */ \ + const struct htp_tensor * restrict src2 = octx->src[2]; /* Wup */ \ + const struct htp_tensor * restrict dst_gate = octx->dsts[0]; \ + const struct htp_tensor * restrict dst_up = octx->dsts[1]; \ + \ + const uint32_t ne00 = src0->ne[0]; \ + const uint32_t ne01 = src0->ne[1]; \ + const uint32_t ne10 = src1->ne[0]; \ + const uint32_t src1_nrows = src1->ne[1] * src1->ne[2] * src1->ne[3]; \ + \ + const size_t dst_row_size = dst_gate->nb[1]; \ + const size_t src1_stride = mmctx->vtcm_src1_stride; \ + \ + uint8_t * restrict vtcm_src0_ptr = mmctx->vtcm_src0 + mmctx->vtcm_src0_size_per_thread * ith; \ + uint8_t * restrict vtcm_src2_ptr = mmctx->vtcm_src2 + mmctx->vtcm_src2_size_per_thread * ith; \ + uint8_t * restrict src1_data = mmctx->vtcm_src1; \ + \ + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; \ + \ + const uint8_t * restrict src0_row = (const uint8_t *) src0->data; \ + const uint8_t * restrict src2_row = (const uint8_t *) src2->data; \ + \ + const uint32_t tile_size = TILE_SIZE; \ + const uint32_t aligned_tile_size = hex_align_up(tile_size, 128); \ + \ + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; \ + const uint32_t n_prefetch = kparams->n_prefetch; \ + assert(n_prefetch >= 2 && n_prefetch <= HTP_MM_MAX_PREFETCH && (n_prefetch & (n_prefetch - 1)) == 0); \ + \ + uint32_t n_k_tiles_w = ne00 / 32; \ + uint32_t n_k_tiles_a = ne10 / 32; \ + uint32_t tile_row_stride = n_k_tiles_w * tile_size; \ + uint32_t tile_row_transfer_size_aligned = n_k_tiles_a * aligned_tile_size; \ + dma_queue * dma_queue = octx->ctx->dma[ith]; \ + \ + const uint32_t src0_nrows = ne01 * src0->ne[2] * src0->ne[3]; \ + const uint32_t src0_start_row = mmctx->src0_nrows_per_thread * ith; \ + const uint32_t src0_end_row = MIN(src0_start_row + mmctx->src0_nrows_per_thread, src0_nrows); \ + \ + uint32_t ct_start = src0_start_row / 32; \ + uint32_t ct_end = (src0_end_row + 31) / 32; \ + \ + uint32_t push_ct = ct_start; \ + for (uint32_t d = 0; d < n_prefetch && push_ct < ct_end; d++, push_ct++) { \ + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + d * tile_row_transfer_size_aligned, \ + src0_row + push_ct * tile_row_stride), aligned_tile_size, tile_size, tile_size, n_k_tiles_a); \ + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src2_ptr + d * tile_row_transfer_size_aligned, \ + src2_row + push_ct * tile_row_stride), aligned_tile_size, tile_size, tile_size, n_k_tiles_a); \ + } \ + \ + for (uint32_t ct = ct_start; ct < ct_end; ct++) { \ + const uint8_t * w_tile_gate = dma_queue_pop(dma_queue).dst; \ + const uint8_t * w_tile_up = dma_queue_pop(dma_queue).dst; \ + \ + int valid_rows = (int)ne01 - (int)(ct * 32); \ + valid_rows = MIN(32, MAX(0, valid_rows)); \ + \ + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ct); \ + uint32_t ir1 = 0; \ + for (; ir1 + 1 < src1_nrows; ir1 += 2) { \ + const uint8_t * restrict src1_col0 = (const uint8_t *) (src1_data + (ir1+0) * src1_stride); \ + const uint8_t * restrict src1_col1 = (const uint8_t *) (src1_data + (ir1+1) * src1_stride); \ + \ + float * restrict dst_row0_gate = (float *) (dst_gate->data + ((ir1+0) * dst_row_size)); \ + float * restrict dst_row1_gate = (float *) (dst_gate->data + ((ir1+1) * dst_row_size)); \ + float * dst_ptr0_gate = &dst_row0_gate[ct * 32]; \ + float * dst_ptr1_gate = &dst_row1_gate[ct * 32]; \ + \ + float * restrict dst_row0_up = (float *) (dst_up->data + ((ir1+0) * dst_row_size)); \ + float * restrict dst_row1_up = (float *) (dst_up->data + ((ir1+1) * dst_row_size)); \ + float * dst_ptr0_up = &dst_row0_up[ct * 32]; \ + float * dst_ptr1_up = &dst_row1_up[ct * 32]; \ + \ + DOT_2X2(ne10, dst_ptr0_gate, dst_ptr1_gate, w_tile_gate, src1_col0, src1_col1, valid_rows); \ + DOT_2X2(ne10, dst_ptr0_up, dst_ptr1_up, w_tile_up, src1_col0, src1_col1, valid_rows); \ + } \ + \ + for (; ir1 < src1_nrows; ++ir1) { \ + const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride); \ + \ + float * restrict dst_row_gate = (float *) (dst_gate->data + (ir1 * dst_row_size)); \ + float * dst_ptr_gate = &dst_row_gate[ct * 32]; \ + \ + float * restrict dst_row_up = (float *) (dst_up->data + (ir1 * dst_row_size)); \ + float * dst_ptr_up = &dst_row_up[ct * 32]; \ + \ + DOT_2X1(ne10, dst_ptr_gate, w_tile_gate, src1_col, valid_rows); \ + DOT_2X1(ne10, dst_ptr_up, w_tile_up, src1_col, valid_rows); \ + } \ + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ct); \ + \ + if (push_ct < ct_end) { \ + dma_queue_push(dma_queue, dma_make_ptr((uint8_t *)w_tile_gate, src0_row + push_ct * tile_row_stride), \ + aligned_tile_size, tile_size, tile_size, n_k_tiles_a); \ + dma_queue_push(dma_queue, dma_make_ptr((uint8_t *)w_tile_up, src2_row + push_ct * tile_row_stride), \ + aligned_tile_size, tile_size, tile_size, n_k_tiles_a); \ + push_ct++; \ + } \ + } \ +} + +MATMUL_2D_REPACKED_IMPL(q4_0, 576, tiled_vec_dot_q4_0_32x2, tiled_vec_dot_q4_0_32x1) +MATMUL_2D_REPACKED_IMPL(q4_1, 640, tiled_vec_dot_q4_1_32x2, tiled_vec_dot_q4_1_32x1) +MATMUL_2D_REPACKED_IMPL(q8_0, 1088, tiled_vec_dot_q8_0_32x2, tiled_vec_dot_q8_0_32x1) +MATMUL_2D_REPACKED_IMPL(iq4nl, 576, tiled_vec_dot_iq4nl_32x2, tiled_vec_dot_iq4nl_32x1) +MATMUL_2D_REPACKED_IMPL(mxfp4, 544, tiled_vec_dot_mxfp4_32x2, tiled_vec_dot_mxfp4_32x1) + +MATMUL_2D_REPACKED_IMPL(q4_0_flat, 576, flat_vec_dot_q4_0_32x2, flat_vec_dot_q4_0_32x1) +MATMUL_2D_REPACKED_IMPL(q4_1_flat, 640, flat_vec_dot_q4_1_32x2, flat_vec_dot_q4_1_32x1) +MATMUL_2D_REPACKED_IMPL(q8_0_flat, 1088, flat_vec_dot_q8_0_32x2, flat_vec_dot_q8_0_32x1) +MATMUL_2D_REPACKED_IMPL(iq4nl_flat, 576, flat_vec_dot_iq4nl_32x2, flat_vec_dot_iq4nl_32x1) +MATMUL_2D_REPACKED_IMPL(mxfp4_flat, 544, flat_vec_dot_mxfp4_32x2, flat_vec_dot_mxfp4_32x1) + +#define QUANTIZE_IMPL(name, log_name, kernel_fn, dst_row_size_expr) \ +static void name(unsigned int nth, unsigned int ith, void * data) { \ + struct htp_mm_context * mmctx = data; \ + struct htp_ops_context * octx = mmctx->octx; \ + const struct htp_tensor * src = octx->src[1]; \ + const uint32_t ne0 = src->ne[0]; \ + const uint32_t ne1 = src->ne[1]; \ + const uint32_t ne2 = src->ne[2]; \ + const uint32_t ne3 = src->ne[3]; \ + const uint32_t nrows = ne1 * ne2 * ne3; \ + const uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread; \ + \ + const uint32_t ir_first = nrows_per_thread * ith; \ + if (ir_first >= nrows) { \ + return; \ + } \ + \ + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; \ + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first); \ + \ + uint8_t * restrict dst = mmctx->vtcm_src1; \ + const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); \ + const size_t src_row_size = src->nb[1]; \ + const size_t dst_row_size = (dst_row_size_expr); \ + const uint8_t * restrict src_data = (const uint8_t *) src->data + (src_row_size * ir_first); \ + uint8_t * restrict dst_data = (uint8_t *) dst + (dst_row_size * ir_first); \ + uint8_t * restrict tmp_data = (uint8_t *) mmctx->vtcm_src0 + (mmctx->vtcm_src0_size_per_thread * ith); \ + kernel_fn(src_data, dst_data, tmp_data, ne0, ir_last - ir_first, src_row_size, dst_row_size); \ + \ + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, ir_first); \ +} + +QUANTIZE_IMPL(quantize_f32_q8_0_tiled, "quantize-f32-q8_0_tiled", quantize_f32_q8_0_tiled_kernel, htp_mm_q8_0_tiled_row_size(ne0)) +QUANTIZE_IMPL(quantize_f32_q8_1_tiled, "quantize-f32-q8_1_tiled", quantize_f32_q8_1_tiled_kernel, htp_mm_q8_1_tiled_row_size(ne0)) +QUANTIZE_IMPL(quantize_f32_q8_0_flat, "quantize-f32-q8_0_flat", quantize_f32_q8_0_flat_kernel, htp_mm_q8_0_flat_row_size(ne0)) +QUANTIZE_IMPL(quantize_f32_q8_1_flat, "quantize-f32-q8_1_flat", quantize_f32_q8_1_flat_kernel, htp_mm_q8_1_flat_row_size(ne0)) +QUANTIZE_IMPL(quantize_f32_f32_flat, "quantize-f32-f32", quantize_f32_f32_flat_kernel, mmctx->vtcm_src1_stride) +QUANTIZE_IMPL(quantize_f32_f16_flat, "quantize-f32-f16", quantize_f32_f16_flat_kernel, mmctx->vtcm_src1_stride) +QUANTIZE_IMPL(quantize_f16_f16_flat, "quantize-f16-f16", quantize_f16_f16_flat_kernel, mmctx->vtcm_src1_stride) + +static void quantize_f32_q8_0_tiled_block(unsigned int nth, unsigned int ith, void * data) { + struct htp_mm_context * mmctx = data; + struct htp_ops_context * octx = mmctx->octx; + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, mmctx->quant_ib_first[ith]); + + const struct htp_tensor * src = octx->src[1]; + + quantize_f32_q8_0_tiled_block_kernel( + (const float *) src->data, + mmctx->vtcm_src1, + (uint8_t *) mmctx->vtcm_src0 + (mmctx->vtcm_src0_size_per_thread * ith), + src->ne[0], + mmctx->quant_ib_first[ith], + mmctx->quant_ib_last[ith], + src->nb[1], + htp_mm_q8_0_tiled_row_size(src->ne[0]), + mmctx->quant_r[ith], + mmctx->quant_c[ith] + ); - FARF(HIGH, "matmul-4d %d/%d: %ux%ux%ux%u (%u:%u %u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", ith, nth, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], ir0_start, ir0_end, ir1_start, ir1_end, src1->ne[0], - src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, mmctx->quant_ib_first[ith]); } -// src1 tensor is already in VTCM spad -static void matmul_2d(unsigned int nth, unsigned int ith, void * data) { +static void quantize_f32_q8_1_tiled_block(unsigned int nth, unsigned int ith, void * data) { + struct htp_mm_context * mmctx = data; + struct htp_ops_context * octx = mmctx->octx; + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, mmctx->quant_ib_first[ith]); + + const struct htp_tensor * src = octx->src[1]; + + quantize_f32_q8_1_tiled_block_kernel( + (const float *) src->data, + mmctx->vtcm_src1, + (uint8_t *) mmctx->vtcm_src0 + (mmctx->vtcm_src0_size_per_thread * ith), + src->ne[0], + mmctx->quant_ib_first[ith], + mmctx->quant_ib_last[ith], + src->nb[1], + htp_mm_q8_1_tiled_row_size(src->ne[0]), + mmctx->quant_r[ith], + mmctx->quant_c[ith] + ); + + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, mmctx->quant_ib_first[ith]); +} + +MATVEC_2D_REPACKED_IMPL(q4_0, 576, tiled_vec_dot_q4_0_32x1) +MATVEC_2D_REPACKED_IMPL(q4_1, 640, tiled_vec_dot_q4_1_32x1) +MATVEC_2D_REPACKED_IMPL(q8_0, 1088, tiled_vec_dot_q8_0_32x1) +MATVEC_2D_REPACKED_IMPL(iq4nl, 576, tiled_vec_dot_iq4nl_32x1) +MATVEC_2D_REPACKED_IMPL(mxfp4, 544, tiled_vec_dot_mxfp4_32x1) + +MATVEC_2D_REPACKED_IMPL(q4_0_flat, 576, flat_vec_dot_q4_0_32x1) +MATVEC_2D_REPACKED_IMPL(q4_1_flat, 640, flat_vec_dot_q4_1_32x1) +MATVEC_2D_REPACKED_IMPL(q8_0_flat, 1088, flat_vec_dot_q8_0_32x1) +MATVEC_2D_REPACKED_IMPL(iq4nl_flat, 576, flat_vec_dot_iq4nl_32x1) +MATVEC_2D_REPACKED_IMPL(mxfp4_flat, 544, flat_vec_dot_mxfp4_32x1) + + +MATMUL_QKV_2D_REPACKED_IMPL(q4_0, 576, tiled_vec_dot_q4_0_32x2, tiled_vec_dot_q4_0_32x1) +MATMUL_QKV_2D_REPACKED_IMPL(q4_1, 640, tiled_vec_dot_q4_1_32x2, tiled_vec_dot_q4_1_32x1) +MATMUL_QKV_2D_REPACKED_IMPL(q8_0, 1088, tiled_vec_dot_q8_0_32x2, tiled_vec_dot_q8_0_32x1) +MATMUL_QKV_2D_REPACKED_IMPL(iq4nl, 576, tiled_vec_dot_iq4nl_32x2, tiled_vec_dot_iq4nl_32x1) +MATMUL_QKV_2D_REPACKED_IMPL(mxfp4, 544, tiled_vec_dot_mxfp4_32x2, tiled_vec_dot_mxfp4_32x1) + +MATMUL_QKV_2D_REPACKED_IMPL(q4_0_flat, 576, flat_vec_dot_q4_0_32x2, flat_vec_dot_q4_0_32x1) +MATMUL_QKV_2D_REPACKED_IMPL(q4_1_flat, 640, flat_vec_dot_q4_1_32x2, flat_vec_dot_q4_1_32x1) +MATMUL_QKV_2D_REPACKED_IMPL(q8_0_flat, 1088, flat_vec_dot_q8_0_32x2, flat_vec_dot_q8_0_32x1) +MATMUL_QKV_2D_REPACKED_IMPL(iq4nl_flat, 576, flat_vec_dot_iq4nl_32x2, flat_vec_dot_iq4nl_32x1) +MATMUL_QKV_2D_REPACKED_IMPL(mxfp4_flat, 544, flat_vec_dot_mxfp4_32x2, flat_vec_dot_mxfp4_32x1) + + +MATMUL_FFN_2D_REPACKED_IMPL(q4_0, 576, tiled_vec_dot_q4_0_32x2, tiled_vec_dot_q4_0_32x1) +MATMUL_FFN_2D_REPACKED_IMPL(q4_1, 640, tiled_vec_dot_q4_1_32x2, tiled_vec_dot_q4_1_32x1) +MATMUL_FFN_2D_REPACKED_IMPL(q8_0, 1088, tiled_vec_dot_q8_0_32x2, tiled_vec_dot_q8_0_32x1) +MATMUL_FFN_2D_REPACKED_IMPL(iq4nl, 576, tiled_vec_dot_iq4nl_32x2, tiled_vec_dot_iq4nl_32x1) +MATMUL_FFN_2D_REPACKED_IMPL(mxfp4, 544, tiled_vec_dot_mxfp4_32x2, tiled_vec_dot_mxfp4_32x1) + +MATMUL_FFN_2D_REPACKED_IMPL(q4_0_flat, 576, flat_vec_dot_q4_0_32x2, flat_vec_dot_q4_0_32x1) +MATMUL_FFN_2D_REPACKED_IMPL(q4_1_flat, 640, flat_vec_dot_q4_1_32x2, flat_vec_dot_q4_1_32x1) +MATMUL_FFN_2D_REPACKED_IMPL(q8_0_flat, 1088, flat_vec_dot_q8_0_32x2, flat_vec_dot_q8_0_32x1) +MATMUL_FFN_2D_REPACKED_IMPL(iq4nl_flat, 576, flat_vec_dot_iq4nl_32x2, flat_vec_dot_iq4nl_32x1) +MATMUL_FFN_2D_REPACKED_IMPL(mxfp4_flat, 544, flat_vec_dot_mxfp4_32x2, flat_vec_dot_mxfp4_32x1) + +static void hvx_mm_2d(unsigned int nth, unsigned int ith, void * data) { htp_matmul_preamble; + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; + const uint32_t n_prefetch = kparams->n_prefetch; + assert(n_prefetch >= 2 && n_prefetch <= HTP_MM_MAX_PREFETCH && (n_prefetch & (n_prefetch - 1)) == 0); + const uint32_t prefetch_mask = n_prefetch - 1; + const uint32_t src0_nrows = ne01 * ne02 * ne03; // src0 rows const uint32_t src1_nrows = ne11 * ne12 * ne13; // src1 rows @@ -3443,40 +1235,38 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) { return; } + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; + const size_t dst_row_size = nb1; const size_t src0_row_size = nb01; const size_t src1_row_size = nb11; - const size_t src0_stride = src0_spad->stride; - const size_t src1_stride = src1_spad->stride; + const size_t src0_stride = mmctx->vtcm_src0_stride; + const size_t src1_stride = mmctx->vtcm_src1_stride; - // Per-thread VTCM scratchpads for all tensors - // Note that the entire src1 tensor is already in VTCM - // For other tensors we allocate N rows per thread, padded to HVX vector size - uint8_t * restrict spad_dst = dst_spad->data + dst_spad->size_per_thread * ith; - uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; - uint8_t * restrict src1_data = src1_spad->data; - - volatile uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + // Per-thread VTCMs for all tensors + uint8_t * restrict vtcm_dst_ptr = mmctx->vtcm_dst + mmctx->vtcm_dst_size_per_thread * ith; + uint8_t * restrict vtcm_src0_ptr = mmctx->vtcm_src0 + mmctx->vtcm_src0_size_per_thread * ith; + uint8_t * restrict src1_data = mmctx->vtcm_src1; const uint8_t * restrict src0_row = (const uint8_t *) src0->data; - // Prefill spad with src0 rows + // Prefill vtcm with src0 rows #pragma unroll(4) for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const int is0 = (ir0 - src0_start_row); - if (is0 >= MM_SPAD_SRC0_NROWS) { + if (is0 >= (int)n_prefetch) { break; } - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), - src0_stride, src0_row_size, 2); + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + is0 * src0_stride, src0_row + ir0 * src0_row_size), + src0_stride, src0_row_size, src0_row_size, 2); } // Process src0 rows for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); // Process src1 columns in pairs (2×2 tiling) uint32_t ir1 = 0; for (; ir1 + 1 < src1_nrows; ir1 += 2) { @@ -3493,42 +1283,37 @@ static void matmul_2d(unsigned int nth, unsigned int ith, void * data) { float * restrict dst_row = (float *) (dst->data + (ir1 * dst_row_size)); mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_stride, src1_col); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); - // Prefetch next (n + spad_nrows) row - const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS); - const int is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS; + // Prefetch next (n + vtcm_nrows) row + const int pr0 = (ir0 + n_prefetch); + const int is0 = (pr0 - src0_start_row) & prefetch_mask; if (pr0 < src0_end_row_x2) { - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + pr0 * src0_row_size), - src0_stride, src0_row_size, 2); + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + is0 * src0_stride, src0_row + pr0 * src0_row_size), + src0_stride, src0_row_size, src0_row_size, 2); } } // Process the last row (if any) if (src0_end_row != src0_end_row_x2) { uint32_t ir0 = src0_end_row_x2; - const int is0 = (ir0 - src0_start_row) % MM_SPAD_SRC0_NROWS; - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), - src0_stride, src0_row_size, 1); + const int is0 = (ir0 - src0_start_row) & prefetch_mask; + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + is0 * src0_stride, src0_row + ir0 * src0_row_size), + src0_stride, src0_row_size, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); #pragma unroll(2) for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) { const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride); float * restrict dst_row = (float *) (dst->data + (ir1 * dst_row_size)); mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col); } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); } - - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "matmul-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mmctx->type, ith, nth, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], - src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); } -// q8x4x2 src1 tensor is already in VTCM spad -static void matvec_2d(unsigned int nth, unsigned int ith, void * data) { +static void hvx_mv_2d(unsigned int nth, unsigned int ith, void * data) { htp_matmul_preamble; const uint32_t src0_nrows = ne01; @@ -3541,1247 +1326,1929 @@ static void matvec_2d(unsigned int nth, unsigned int ith, void * data) { return; } + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; + const size_t dst_row_size = nb1; const size_t src0_row_size = nb01; const size_t src1_row_size = nb11; - const size_t src0_stride = src0_spad->stride; - const size_t src1_stride = src1_spad->stride; - - // Per-thread VTCM scratchpads for all tensors - // Note that the entire src1 tensor is already in VTCM - // For other tensors we allocate N rows per thread, padded to HVX vector size - uint8_t * spad_dst = dst_spad->data + dst_spad->size_per_thread * ith; - uint8_t * spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; - uint8_t * src1_data = src1_spad->data; + const size_t src0_stride = mmctx->vtcm_src0_stride; + const size_t src1_stride = mmctx->vtcm_src1_stride; - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + // Per-thread VTCMs for all tensors + uint8_t * vtcm_dst_ptr = mmctx->vtcm_dst + mmctx->vtcm_dst_size_per_thread * ith; + uint8_t * vtcm_src0_ptr = mmctx->vtcm_src0 + mmctx->vtcm_src0_size_per_thread * ith; + uint8_t * src1_data = mmctx->vtcm_src1; - float * tmp = (float *) spad_dst; + float * tmp = (float *) vtcm_dst_ptr; const uint8_t * restrict src0_row = (const uint8_t *) src0->data; const uint8_t * restrict src1_col = (const uint8_t *) src1_data; float * restrict dst_col = (float *) dst->data; - if (mmctx->vec_dot_4x1 != NULL) { - const uint32_t src0_end_row_x4 = src0_start_row + ((src0_end_row - src0_start_row) & ~3U); + const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U); - // Prefill spad with 4x src0 rows - #pragma unroll(4) - for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x4; ir0 += 4) { - const uint32_t is0 = (ir0 - src0_start_row); - if (is0 >= MM_SPAD_SRC0_NROWS) { - break; - } - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), - src0_stride, src0_row_size, 4); + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; + const uint32_t n_prefetch = kparams->n_prefetch; + assert(n_prefetch >= 2 && n_prefetch <= HTP_MM_MAX_PREFETCH && (n_prefetch & (n_prefetch - 1)) == 0); + const uint32_t prefetch_mask = n_prefetch - 1; + + // Prefill vtcm with 2x src0 rows + #pragma unroll(2) + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const uint32_t is0 = (ir0 - src0_start_row); + if (is0 >= n_prefetch) { + break; } + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + is0 * src0_stride, src0_row + ir0 * src0_row_size), + src0_stride, src0_row_size, src0_row_size, 2); + } - // Process src0 rows - for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x4; ir0 += 4) { - const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; - mmctx->vec_dot_4x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, ss0 + 2 * src0_stride, ss0 + 3 * src0_stride, src1_col); - - // Prefetch next (n + spad_nrows) row - const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS); - const uint32_t is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS; - if (pr0 < src0_end_row_x4) { - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + pr0 * src0_row_size), - src0_stride, src0_row_size, 4); - } + // Process src0 rows + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); + mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); + + // Prefetch next (n + vtcm_nrows) row + const uint32_t pr0 = (ir0 + n_prefetch); + const uint32_t is0 = (pr0 - src0_start_row) & prefetch_mask; + if (pr0 < src0_end_row_x2) { + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + is0 * src0_stride, src0_row + pr0 * src0_row_size), + src0_stride, src0_row_size, src0_row_size, 2); } + } + + // Process the last row (if any) + if (src0_end_row != src0_end_row_x2) { + const uint32_t ir0 = src0_end_row_x2; + const uint32_t is0 = (ir0 - src0_start_row) & prefetch_mask; + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + is0 * src0_stride, src0_row + ir0 * src0_row_size), + src0_stride, src0_row_size, src0_row_size, 1); + const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ir0); + mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ir0); + } + + hvx_copy_f32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row); +} + +#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ids->ne[0] * ids->ne[1] + (i1)] + +static void hvx_mm_id(unsigned int nth, unsigned int ith, void * data) { + htp_matmul_preamble; + + const struct htp_tensor * restrict ids = octx->src[2]; + + uint64_t t1, t2; + t1 = HAP_perf_get_qtimer_count(); + + const uint32_t src0_nrows = ne01; // src0 rows per expert + const uint32_t src1_nrows = ne11; + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; + } + + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; + + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; + const uint32_t n_prefetch = kparams->n_prefetch; + assert(n_prefetch >= 2 && n_prefetch <= HTP_MM_MAX_PREFETCH && (n_prefetch & (n_prefetch - 1)) == 0); + + const uint32_t n_ids = ids->ne[0]; // n_expert_used + const uint32_t n_as = ne02; // n_expert + + const uint32_t * matrix_row_counts = mmctx->matrix_row_counts; + const struct mmid_row_mapping * matrix_rows = mmctx->matrix_rows; + + const size_t dst_row_size = nb1; + const size_t src1_row_size = htp_mm_q8_0_tiled_row_size(ne10); + + const size_t src1_stride = mmctx->vtcm_src1_stride; - // Process leftovers - uint32_t ir0 = src0_end_row_x4; - if (ir0 + 2 <= src0_end_row) { - const uint32_t is0 = (ir0 - src0_start_row) % MM_SPAD_SRC0_NROWS; - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), - src0_stride, src0_row_size, 2); - const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; - mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col); - ir0 += 2; + // Per-thread VTCMs for all tensors + uint8_t * restrict vtcm_src0_ptr = mmctx->vtcm_src0 + mmctx->vtcm_src0_size_per_thread * ith; + uint8_t * restrict src1_data = mmctx->vtcm_src1; + + for (uint32_t cur_a = 0; cur_a < n_as; ++cur_a) { + const int32_t cne1 = matrix_row_counts[cur_a]; + if (cne1 == 0) { + continue; } - if (ir0 < src0_end_row) { - const uint32_t is0 = (ir0 - src0_start_row) % MM_SPAD_SRC0_NROWS; - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), - src0_stride, src0_row_size, 1); - const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; - mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col); - ir0 += 1; + + const uint8_t * src0_row = (const uint8_t *) src0->data + cur_a * nb02; + + const uint32_t tile_size = htp_mm_get_weight_tile_size(src0->type); + const uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(src0->type); + const uint32_t n_k_tiles_w = ne00 / 32; + const uint32_t n_k_tiles_a = ne10 / 32; + const uint32_t tile_row_stride = n_k_tiles_w * tile_size; + const uint32_t tile_row_transfer_size_aligned = n_k_tiles_a * aligned_tile_size; + + const uint32_t ct_start = src0_start_row / 32; + const uint32_t ct_end = (src0_end_row + 31) / 32; + + uint32_t push_ct = ct_start; + for (uint32_t d = 0; d < n_prefetch && push_ct < ct_end; d++, push_ct++) { + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + d * tile_row_transfer_size_aligned, src0_row + push_ct * tile_row_stride), + aligned_tile_size, tile_size, tile_size, n_k_tiles_a); } - } else { - const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U); - // Prefill spad with 2x src0 rows - #pragma unroll(2) - for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { - const uint32_t is0 = (ir0 - src0_start_row); - if (is0 >= MM_SPAD_SRC0_NROWS) { - break; + for (uint32_t ct = ct_start; ct < ct_end; ct++) { + const uint8_t * w_tile = dma_queue_pop(dma_queue).dst; + + int valid_rows = (int)ne01 - (int)(ct * 32); + valid_rows = MIN(32, MAX(0, valid_rows)); + + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ct); + for (uint32_t cid = 0; cid < cne1; ++cid) { + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid); + const int rm1 = row_mapping.i1; // expert idx + const int rm2 = row_mapping.i2; // token idx + + const uint32_t ir1 = fastmodulo(rm1, ne11, &mmctx->mm_div_ne11); // src1 row idx + const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + (ir1 + rm2 * ne11 + 0) * src1_stride); + float * restrict dst_row = (float *) (dst->data + (rm1 * nb1 + rm2 * nb2 + 0)); + + mmctx->vec_dot_32x1(ne10, &dst_row[ct * 32], w_tile, src1_col, valid_rows); } - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), - src0_stride, src0_row_size, 2); - } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ct); - // Process src0 rows - for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { - const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; - mmctx->vec_dot_2x1(ne00, &tmp[ir0 - src0_start_row], ss0, ss0 + src0_stride, src1_col); - - // Prefetch next (n + spad_nrows) row - const uint32_t pr0 = (ir0 + MM_SPAD_SRC0_NROWS); - const uint32_t is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS; - if (pr0 < src0_end_row_x2) { - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + pr0 * src0_row_size), - src0_stride, src0_row_size, 2); + if (push_ct < ct_end) { + dma_queue_push(dma_queue, dma_make_ptr((uint8_t *)w_tile, src0_row + push_ct * tile_row_stride), + aligned_tile_size, tile_size, tile_size, n_k_tiles_a); + push_ct++; } } + } +} - // Process the last row (if any) - if (src0_end_row != src0_end_row_x2) { - const uint32_t ir0 = src0_end_row_x2; - const uint32_t is0 = (ir0 - src0_start_row) % MM_SPAD_SRC0_NROWS; - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_stride, src0_row + ir0 * src0_row_size), - src0_stride, src0_row_size, 1); - const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; - mmctx->vec_dot_1x1(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col); - } +static void hvx_mv_id(unsigned int nth, unsigned int ith, void * data) { + htp_matmul_preamble; + + const struct htp_tensor * restrict ids = octx->src[2]; + + const uint32_t src0_nrows = ne01; // src0 rows per expert + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + + // no work for this thread + if (src0_start_row >= src0_end_row) { + return; } - hvx_copy_f32_ua((uint8_t *) &dst_col[src0_start_row], (uint8_t *) tmp, src0_end_row - src0_start_row); + struct htp_thread_trace * tr = octx->ctx ? &octx->ctx->trace[ith] : NULL; + + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; + const uint32_t n_prefetch = kparams->n_prefetch; + assert(n_prefetch >= 2 && n_prefetch <= HTP_MM_MAX_PREFETCH && (n_prefetch & (n_prefetch - 1)) == 0); + + assert(ne13 % ne03 == 0); + + const size_t dst_row_size = nb1; + const size_t src1_row_size = htp_mm_q8_0_tiled_row_size(ne10); + + const uint32_t n_aids = src2->ne[0]; // num activated experts + const uint32_t n_ids = ne02; // num experts + + // Per-thread VTCMs for all tensors + uint8_t * restrict vtcm_src0_ptr = mmctx->vtcm_src0 + mmctx->vtcm_src0_size_per_thread * ith; + uint8_t * restrict src1_data = mmctx->vtcm_src1; + + for (uint32_t ie1 = 0; ie1 < n_aids; ++ie1) { // for each expert + const int32_t eid = *(const int32_t *) ((const uint8_t *) src2->data + ie1 * src2->nb[0]); + if (eid < 0) { + continue; + } + assert(eid < (int32_t) n_ids); + + const uint8_t * restrict src0_row = (const uint8_t *) src0->data + eid * nb02; + const uint8_t * restrict src1_col = (const uint8_t *) src1_data; + float * restrict dst_row = (float *) (dst->data + ie1 * nb1); + + const uint32_t tile_size = htp_mm_get_weight_tile_size(src0->type); + const uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(src0->type); + const uint32_t n_k_tiles_w = ne00 / 32; + const uint32_t n_k_tiles_a = ne10 / 32; + const uint32_t tile_row_stride = n_k_tiles_w * tile_size; + const uint32_t tile_row_transfer_size_aligned = n_k_tiles_a * aligned_tile_size; + + const uint32_t ct_start = src0_start_row / 32; + const uint32_t ct_end = (src0_end_row + 31) / 32; + + uint32_t push_ct = ct_start; + for (uint32_t d = 0; d < n_prefetch && push_ct < ct_end; d++, push_ct++) { + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + d * tile_row_transfer_size_aligned, src0_row + push_ct * tile_row_stride), + aligned_tile_size, tile_size, tile_size, n_k_tiles_a); + } + + for (uint32_t ct = ct_start; ct < ct_end; ct++) { + const uint8_t * w_tile = dma_queue_pop(dma_queue).dst; - t2 = HAP_perf_get_qtimer_count(); + int valid_rows = (int)ne01 - (int)(ct * 32); + valid_rows = MIN(32, MAX(0, valid_rows)); - FARF(HIGH, "matvec-%s %u/%u: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u -> %ux%ux%ux%u usec %u\n", mmctx->type, ith, nth, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], src1->ne[1], - src1->ne[2], src1->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_COMP, ct); + mmctx->vec_dot_32x1(ne10, &dst_row[ct * 32], w_tile, src1_col, valid_rows); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_COMP, ct); + + if (push_ct < ct_end) { + dma_queue_push(dma_queue, dma_make_ptr((uint8_t *)w_tile, src0_row + push_ct * tile_row_stride), + aligned_tile_size, tile_size, tile_size, n_k_tiles_a); + push_ct++; + } + } + } } -#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id) * ids->ne[0] * ids->ne[1] + (i1)] +static int hvx_mm_init_vec_dot(struct htp_mm_context * mmctx, enum htp_data_type type) { + switch (type) { + case HTP_TYPE_Q4_0: + mmctx->type = "q4_0_tiled-f32"; + mmctx->vec_dot_32x1 = tiled_vec_dot_q4_0_32x1; + return 0; + case HTP_TYPE_Q4_1: + mmctx->type = "q4_1_tiled-f32"; + mmctx->vec_dot_32x1 = tiled_vec_dot_q4_1_32x1; + return 0; + case HTP_TYPE_Q8_0: + mmctx->type = "q8_0_tiled-f32"; + mmctx->vec_dot_32x1 = tiled_vec_dot_q8_0_32x1; + return 0; + case HTP_TYPE_IQ4_NL: + mmctx->type = "iq4nl_tiled-f32"; + mmctx->vec_dot_32x1 = tiled_vec_dot_iq4nl_32x1; + return 0; + case HTP_TYPE_MXFP4: + mmctx->type = "mxfp4_tiled-f32"; + mmctx->vec_dot_32x1 = tiled_vec_dot_mxfp4_32x1; + return 0; + default: + return -1; + } +} -struct mmid_row_mapping { - uint32_t i1; - uint32_t i2; -}; +static int hvx_mm_matmul(struct htp_ops_context * octx) { + htp_matmul_tensors_preamble; -// src1 tensor is already in VTCM spad -static void matmul_id(unsigned int nth, unsigned int ith, void * data) { - htp_matmul_preamble; + struct htp_mm_context mmctx_struct = {0}; + struct htp_mm_context * mmctx = &mmctx_struct; + mmctx->octx = octx; - const struct htp_tensor * restrict ids = octx->src[2]; - struct htp_spad * restrict src2_spad = &octx->src2_spad; + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + const uint32_t src0_nrows = ne01 * ne02 * ne03; + const uint32_t src1_nrows = ne11 * ne12 * ne13; + + bool is_repacked = (src0->type == HTP_TYPE_Q4_0 || src0->type == HTP_TYPE_Q4_1 || + src0->type == HTP_TYPE_Q8_0 || src0->type == HTP_TYPE_IQ4_NL || + src0->type == HTP_TYPE_MXFP4); + + // Compute src0_nrows_per_thread + mmctx->src0_nrows_per_thread = (src0_nrows + octx->n_threads - 1) / octx->n_threads; + if (is_repacked) { + mmctx->src0_nrows_per_thread = hex_round_up(mmctx->src0_nrows_per_thread, 32); + } else { + mmctx->src0_nrows_per_thread += (mmctx->src0_nrows_per_thread & 1); // round up to even + } - const uint32_t src0_nrows = ne01; // src0 rows per expert - const uint32_t src1_nrows = ne11; + const size_t src0_row_size = nb01; + const size_t dst_row_size = nb1; + size_t src1_row_size = nb11; - const uint32_t src0_start_row = src0_nrows_per_thread * ith; - const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); - const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U); + const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); + size_t src1_row_size_padded; - // no work for this thread - if (src0_start_row >= src0_end_row) { - return; + worker_callback_t quant_job_func; + worker_callback_t matmul_job_func; + uint32_t n_quant_jobs = 1; + if (src1_nrows > 1) { + if (is_repacked) { + switch (src0->type) { + case HTP_TYPE_Q4_0: matmul_job_func = hvx_mm_2d_repacked_q4_0; break; + case HTP_TYPE_Q4_1: matmul_job_func = hvx_mm_2d_repacked_q4_1; break; + case HTP_TYPE_Q8_0: matmul_job_func = hvx_mm_2d_repacked_q8_0; break; + case HTP_TYPE_IQ4_NL: matmul_job_func = hvx_mm_2d_repacked_iq4nl; break; + case HTP_TYPE_MXFP4: matmul_job_func = hvx_mm_2d_repacked_mxfp4; break; + default: return HTP_STATUS_NO_SUPPORT; + } + } else { + matmul_job_func = hvx_mm_2d; + } + } else { + if (is_repacked) { + switch (src0->type) { + case HTP_TYPE_Q4_0: matmul_job_func = hvx_mv_2d_repacked_q4_0; break; + case HTP_TYPE_Q4_1: matmul_job_func = hvx_mv_2d_repacked_q4_1; break; + case HTP_TYPE_Q8_0: matmul_job_func = hvx_mv_2d_repacked_q8_0; break; + case HTP_TYPE_IQ4_NL: matmul_job_func = hvx_mv_2d_repacked_iq4nl; break; + case HTP_TYPE_MXFP4: matmul_job_func = hvx_mv_2d_repacked_mxfp4; break; + default: return HTP_STATUS_NO_SUPPORT; + } + } else { + matmul_job_func = hvx_mv_2d; + } } - const uint32_t n_ids = ids->ne[0]; // n_expert_used - const uint32_t n_as = ne02; // n_expert - - const uint32_t * matrix_row_counts = mmctx->matrix_row_counts; - const struct mmid_row_mapping * matrix_rows = mmctx->matrix_rows; + bool need_quant = true; - const size_t dst_row_size = nb1; - const size_t src0_row_size = nb01; - const size_t src1_row_size = q8x4x2_row_size(ne10); + switch (kparams->kernel_type) { + case HTP_MM_KERNEL_HVX_F16_F16_VTCM: + quant_job_func = (src1->type == HTP_TYPE_F32) ? quantize_f32_f16_flat : quantize_f16_f16_flat; + mmctx->type = "f16-f16"; + mmctx->vec_dot_1x1 = vec_dot_f16_f16_aa_1x1; + mmctx->vec_dot_2x1 = vec_dot_f16_f16_aa_2x1; + mmctx->vec_dot_2x2 = vec_dot_f16_f16_aa_2x2; + src1_row_size = hex_round_up(ne10 * 2, 128); + break; - const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); + case HTP_MM_KERNEL_HVX_F16_F32_DDR: + mmctx->type = "f16-f32"; + mmctx->vec_dot_1x1 = vec_dot_f16_f32_uu_1x1; + matmul_job_func = hvx_mm_4d; + mmctx->mm_div_ne12_ne1 = kparams->div_ne12_ne1; + mmctx->mm_div_ne1 = kparams->div_ne1; + mmctx->mm_div_r2 = kparams->div_r2; + mmctx->mm_div_r3 = kparams->div_r3; + need_quant = false; + quant_job_func = NULL; + src1_row_size = nb11; + break; - // Per-thread VTCM scratchpads for all tensors - // Note that the entire src1 tensor is already in VTCM - // For other tensors we allocate N rows per thread, padded to HVX vector size - uint8_t * restrict spad_dst = dst_spad->data + dst_spad->size_per_thread * ith; - uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; - uint8_t * restrict src1_data = src1_spad->data; + case HTP_MM_KERNEL_HVX_F16_F16_DDR: + mmctx->type = "f16-f16"; + mmctx->vec_dot_1x1 = vec_dot_f16_f16_uu_1x1; + matmul_job_func = hvx_mm_4d; + mmctx->mm_div_ne12_ne1 = kparams->div_ne12_ne1; + mmctx->mm_div_ne1 = kparams->div_ne1; + mmctx->mm_div_r2 = kparams->div_r2; + mmctx->mm_div_r3 = kparams->div_r3; + src1_row_size = nb11; + need_quant = false; + quant_job_func = NULL; + break; - for (uint32_t cur_a = 0; cur_a < n_as; ++cur_a) { - const int32_t cne1 = matrix_row_counts[cur_a]; + case HTP_MM_KERNEL_HVX_F32_F32_VTCM: + quant_job_func = quantize_f32_f32_flat; + mmctx->type = "f32-f32"; + mmctx->vec_dot_1x1 = vec_dot_f32_f32_aa_1x1; + mmctx->vec_dot_2x1 = vec_dot_f32_f32_aa_2x1; + mmctx->vec_dot_2x2 = vec_dot_f32_f32_aa_2x2; + src1_row_size = hex_round_up(ne10 * 4, 128); + break; - if (cne1 == 0) { - continue; - } + case HTP_MM_KERNEL_HVX_F32_F32_DDR: + quant_job_func = NULL; + mmctx->type = "f32-f32"; + mmctx->vec_dot_1x1 = vec_dot_f32_f32_uu_1x1; + mmctx->mm_div_ne12_ne1 = kparams->div_ne12_ne1; + mmctx->mm_div_ne1 = kparams->div_ne1; + mmctx->mm_div_r2 = kparams->div_r2; + mmctx->mm_div_r3 = kparams->div_r3; + src1_row_size = nb11; + need_quant = false; + matmul_job_func = hvx_mm_4d; + break; - if (mmctx->hmx_eligible) { - continue; + case HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT: { + n_quant_jobs = MIN(src1_nrows, octx->n_threads); + quant_job_func = (src0->type == HTP_TYPE_Q4_1) ? quantize_f32_q8_1_flat : quantize_f32_q8_0_flat; + src1_row_size = (src0->type == HTP_TYPE_Q4_1) ? htp_mm_q8_1_flat_row_size(ne10) : htp_mm_q8_0_flat_row_size(ne10); + + if (src1_nrows > 1) { + switch (src0->type) { + case HTP_TYPE_Q4_0: matmul_job_func = hvx_mm_2d_repacked_q4_0_flat; break; + case HTP_TYPE_Q4_1: matmul_job_func = hvx_mm_2d_repacked_q4_1_flat; break; + case HTP_TYPE_Q8_0: matmul_job_func = hvx_mm_2d_repacked_q8_0_flat; break; + case HTP_TYPE_IQ4_NL: matmul_job_func = hvx_mm_2d_repacked_iq4nl_flat; break; + case HTP_TYPE_MXFP4: matmul_job_func = hvx_mm_2d_repacked_mxfp4_flat; break; + default: return HTP_STATUS_NO_SUPPORT; + } + } else { + switch (src0->type) { + case HTP_TYPE_Q4_0: matmul_job_func = hvx_mv_2d_repacked_q4_0_flat; break; + case HTP_TYPE_Q4_1: matmul_job_func = hvx_mv_2d_repacked_q4_1_flat; break; + case HTP_TYPE_Q8_0: matmul_job_func = hvx_mv_2d_repacked_q8_0_flat; break; + case HTP_TYPE_IQ4_NL: matmul_job_func = hvx_mv_2d_repacked_iq4nl_flat; break; + case HTP_TYPE_MXFP4: matmul_job_func = hvx_mv_2d_repacked_mxfp4_flat; break; + default: return HTP_STATUS_NO_SUPPORT; + } + } + break; } - const uint8_t * src0_row = (const uint8_t *) src0->data + (0 + cur_a * nb02 + 0); + case HTP_MM_KERNEL_HVX_QUANT_BLOCK: + case HTP_MM_KERNEL_HVX_QUANT_ROW: + default: + if (hvx_mm_init_vec_dot(mmctx, src0->type) != 0) { + return HTP_STATUS_NO_SUPPORT; + } - // Prefill spad with src0 rows - #pragma unroll(4) - for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { - const int is0 = (ir0 - src0_start_row); - if (is0 >= MM_SPAD_SRC0_NROWS) { - break; + const uint32_t qk = QK_Q8_0_TILED; + const uint32_t nb = (ne10 + qk - 1) / qk; + const uint32_t total_nb = src1_nrows * nb; + + if (src1_nrows < octx->n_threads) { + n_quant_jobs = MIN(total_nb, octx->n_threads); + quant_job_func = (src0->type == HTP_TYPE_Q4_1) ? quantize_f32_q8_1_tiled_block : quantize_f32_q8_0_tiled_block; + for (uint32_t ith = 0; ith < n_quant_jobs; ++ith) { + uint32_t ib_first = (total_nb * ith) / n_quant_jobs; + uint32_t ib_last = (total_nb * (ith + 1)) / n_quant_jobs; + mmctx->quant_ib_first[ith] = ib_first; + mmctx->quant_ib_last[ith] = ib_last; + mmctx->quant_r[ith] = ib_first / nb; + mmctx->quant_c[ith] = ib_first % nb; + } + } else { + n_quant_jobs = MIN(src1_nrows, octx->n_threads); + quant_job_func = (src0->type == HTP_TYPE_Q4_1) ? quantize_f32_q8_1_tiled : quantize_f32_q8_0_tiled; } - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size), - src0_row_size_padded, src0_row_size, 2); - } + src1_row_size = (src0->type == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10) : htp_mm_q8_0_tiled_row_size(ne10); + break; + } - // Process src0 rows - for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { - const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + size_t src0_sz = 0, src1_sz = 0, dst_sz = 0; + if (kparams->vtcm_src0_size > 0 || kparams->vtcm_src1_size > 0 || kparams->vtcm_dst_size > 0) { + src0_sz = kparams->vtcm_src0_size; + src1_sz = kparams->vtcm_src1_size; + dst_sz = kparams->vtcm_dst_size; + } else { + const uint32_t n_prefetch = kparams->n_prefetch; + assert(n_prefetch >= 2 && n_prefetch <= HTP_MM_MAX_PREFETCH && (n_prefetch & (n_prefetch - 1)) == 0); + htp_mm_hvx_get_vtcm_sizes( + kparams->kernel_type, src0->type, ne10, src1_nrows, octx->n_threads, + dst_row_size, src0_row_size, src1_row_size, n_prefetch, + &src0_sz, &src1_sz, &dst_sz + ); + } + + if (kparams->kernel_type == HTP_MM_KERNEL_HVX_F16_F16_VTCM || + kparams->kernel_type == HTP_MM_KERNEL_HVX_F32_F32_VTCM || + kparams->kernel_type == HTP_MM_KERNEL_HVX_QUANT_ROW || + kparams->kernel_type == HTP_MM_KERNEL_HVX_QUANT_BLOCK) { + mmctx->vtcm_src1_size_per_thread = src1_sz; + } else { + mmctx->vtcm_src1_size_per_thread = src1_sz / octx->n_threads; + } - for (uint32_t cid = 0; cid < cne1; ++cid) { - struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid); - const int rm1 = row_mapping.i1; // expert idx - const int rm2 = row_mapping.i2; // token idx + mmctx->vtcm_src0_size_per_thread = src0_sz / octx->n_threads; + mmctx->vtcm_dst_size_per_thread = dst_sz / octx->n_threads; - const uint32_t ir1 = src1_nrows == 1 ? 0 : rm1; // src1 row idx - const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + (ir1 + rm2 * ne11 + 0) * src1_row_size); - float * dst_row = (float *) (dst->data + (rm1 * nb1 + rm2 * nb2 + 0)); + size_t vtcm_size = kparams->vtcm_size > 0 ? (size_t)kparams->vtcm_size : (src1_sz + src0_sz + dst_sz); - mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_row_size_padded, src1_col); - } + FARF(HIGH, "matmul-%s : src0-vtcm-size %zu src1-vtcm-size %zu dst-vtcm-size %zu (%zu)\n", mmctx->type, + src0_sz, src1_sz, dst_sz, vtcm_size); - // Prefetch next (n + spad_nrows) row - const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS); - const int is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS; - if (pr0 < src0_end_row_x2) { - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size), - src0_row_size_padded, src0_row_size, 2); - } - } + FARF(HIGH, "matmul-%s : %ux%ux%ux%u * %ux%ux%ux%u-> %ux%ux%ux%u (0x%p, 0x%p, 0x%p)\n", mmctx->type, src0->ne[0], + src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], + dst->ne[1], dst->ne[2], dst->ne[3], src0->data, src1->data, dst->data); - // Process the last row (if any) - if (src0_end_row != src0_end_row_x2) { - uint32_t ir0 = src0_end_row_x2; - const uint32_t is0 = (ir0 - src0_start_row) % MM_SPAD_SRC0_NROWS; - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size), - src0_row_size_padded, src0_row_size, 1); - const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + if (octx->ctx->vtcm_size < vtcm_size) { + FARF(ERROR, "matmul-%s : current VTCM reservation %zu is too small, needed %zu\n", mmctx->type, + octx->ctx->vtcm_size, vtcm_size); + return HTP_STATUS_VTCM_TOO_SMALL; + } - for (uint32_t cid = 0; cid < cne1; ++cid) { - struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, cid); - const int rm1 = row_mapping.i1; // expert idx - const int rm2 = row_mapping.i2; // token idx + uint8_t * vtcm_ptr = (uint8_t *) octx->ctx->vtcm_base; + mmctx->vtcm_src1 = vtcm_seq_alloc(&vtcm_ptr, src1_sz); + mmctx->vtcm_src0 = vtcm_seq_alloc(&vtcm_ptr, src0_sz); + mmctx->vtcm_dst = vtcm_seq_alloc(&vtcm_ptr, dst_sz); - const uint32_t ir1 = src1_nrows == 1 ? 0 : rm1; // src1 row idx - const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + (ir1 + rm2 * ne11 + 0) * src1_row_size); - float * dst_row = (float *) (dst->data + (rm1 * nb1 + rm2 * nb2 + 0)); + octx->src1_spad.src = NULL; + octx->src0_spad.src = NULL; + octx->dst_spad.src = NULL; - mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col); - } - } + mmctx->vtcm_src0_stride = src0_row_size_padded; + mmctx->vtcm_src1_stride = src1_row_size; + + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) + return HTP_STATUS_OK; + + if (need_quant) { + mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs; + worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs); } - t2 = HAP_perf_get_qtimer_count(); + const uint32_t n_matmul_jobs = octx->n_threads; + worker_pool_run_func(octx->ctx->worker_pool, matmul_job_func, mmctx, n_matmul_jobs); - FARF(HIGH, "matmul-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mmctx->type, - ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], - src1->ne[1], src1->ne[2], src1->ne[3], ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1], - dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + return HTP_STATUS_OK; } -// src1 tensor is already in VTCM spad -static void matvec_id(unsigned int nth, unsigned int ith, void * data) { - htp_matmul_preamble; +static void hvx_mm_qkv_2d(unsigned int nth, unsigned int ith, void * data) { + struct htp_mm_context * mmctx = data; + struct htp_ops_context * octx = mmctx->octx; - const struct htp_tensor * restrict ids = octx->src[2]; - struct htp_spad * restrict src2_spad = &octx->src2_spad; + const struct htp_tensor * restrict src0 = octx->src[0]; // Wk + const struct htp_tensor * restrict src1 = octx->src[1]; // x + const struct htp_tensor * restrict src2 = octx->src[2]; // Wv + const struct htp_tensor * restrict src3 = octx->src[3]; // Wq + const struct htp_tensor * restrict dst_k = octx->dsts[0]; + const struct htp_tensor * restrict dst_v = octx->dsts[1]; + const struct htp_tensor * restrict dst_q = octx->dsts[2]; - uint64_t t1, t2; - t1 = HAP_perf_get_qtimer_count(); + const uint32_t ne00 = src0->ne[0]; + const uint32_t ne01 = src0->ne[1]; + const uint32_t ne02 = src0->ne[2]; + const uint32_t ne03 = src0->ne[3]; - const uint32_t src0_nrows = ne01; // src0 rows per expert + const uint32_t ne11 = src1->ne[1]; + const uint32_t ne12 = src1->ne[2]; + const uint32_t ne13 = src1->ne[3]; + + const uint32_t src0_nrows = ne01 * ne02 * ne03; + const uint32_t src1_nrows = ne11 * ne12 * ne13; + const uint32_t src0_nrows_per_thread = mmctx->src0_nrows_per_thread; const uint32_t src0_start_row = src0_nrows_per_thread * ith; const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U); - // no work for this thread if (src0_start_row >= src0_end_row) { return; } - assert(ne13 % ne03 == 0); - - const size_t dst_row_size = nb1; - const size_t src0_row_size = nb01; - const size_t src1_row_size = q8x4x2_row_size(ne10); - - const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); - - const uint32_t n_aids = src2->ne[0]; // num activated experts - const uint32_t n_ids = ne02; // num experts + const size_t dst_k_row_size = dst_k->nb[1]; // K and V share output width + const size_t dst_q_row_size = dst_q->nb[1]; // Q may be wider (GQA) + const size_t src0_row_size = src0->nb[1]; + const size_t src2_row_size = src2->nb[1]; + const size_t src3_row_size = src3->nb[1]; - // Per-thread VTCM scratchpads for all tensors - // Note that the entire src1 tensor is already in VTCM - // For other tensors we allocate N rows per thread, padded to HVX vector size - uint8_t * restrict spad_dst = dst_spad->data + dst_spad->size_per_thread * ith; - uint8_t * restrict spad_src0 = src0_spad->data + src0_spad->size_per_thread * ith; - uint8_t * restrict src1_data = src1_spad->data; + const size_t src0_stride = mmctx->vtcm_src0_stride; + const size_t src2_stride = mmctx->vtcm_src2_stride; + const size_t src3_stride = mmctx->vtcm_src3_stride; + const size_t src1_stride = mmctx->vtcm_src1_stride; - for (uint32_t ie1 = 0; ie1 < n_aids; ++ie1) { // for each expert - const uint32_t eid = *(const int32_t *) ((const uint8_t *) src2->data + ie1 * src2->nb[0]); - assert(eid < n_ids); + uint8_t * restrict vtcm_src0_ptr = mmctx->vtcm_src0 + mmctx->vtcm_src0_size_per_thread * ith; + uint8_t * restrict vtcm_src2_ptr = mmctx->vtcm_src2 + mmctx->vtcm_src2_size_per_thread * ith; + uint8_t * restrict vtcm_src3_ptr = mmctx->vtcm_src3 + mmctx->vtcm_src3_size_per_thread * ith; + uint8_t * restrict src1_data = mmctx->vtcm_src1; - const uint8_t * restrict src0_row = (const uint8_t *) src0->data + eid * nb02; - const uint8_t * restrict src1_col = (const uint8_t *) src1_data; - float * restrict dst_row = (float *) (dst->data + ie1 * nb1); + dma_queue * dma_queue = octx->ctx->dma[ith]; - // Prefill spad with src0 rows - #pragma unroll(4) - for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { - const int is0 = (ir0 - src0_start_row); - if (is0 >= MM_SPAD_SRC0_NROWS) { - break; - } - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size), - src0_row_size_padded, src0_row_size, 2); - } + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; + const uint32_t n_prefetch = kparams->n_prefetch; + assert(n_prefetch >= 2 && n_prefetch <= HTP_MM_MAX_PREFETCH && (n_prefetch & (n_prefetch - 1)) == 0); + const uint32_t prefetch_mask = n_prefetch - 1; - // Process src0 rows - for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { - const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; - mmctx->vec_dot_2x1(ne00, &dst_row[ir0], ss0, ss0 + src0_row_size_padded, src1_col); - - // Prefetch next (n + spad_nrows) row - const int pr0 = (ir0 + MM_SPAD_SRC0_NROWS); - const int is0 = (pr0 - src0_start_row) % MM_SPAD_SRC0_NROWS; - if (pr0 < src0_end_row_x2) { - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size), - src0_row_size_padded, src0_row_size, 2); - } - } + const uint8_t * restrict src0_row = (const uint8_t *) src0->data; + const uint8_t * restrict src2_row = (const uint8_t *) src2->data; + const uint8_t * restrict src3_row = (const uint8_t *) src3->data; - // Process the last row (if any) - if (src0_end_row != src0_end_row_x2) { - uint32_t ir0 = src0_end_row_x2; - const uint32_t is0 = (ir0 - src0_start_row) % MM_SPAD_SRC0_NROWS; - dma_queue_push_ddr_to_vtcm(dma_queue, dma_make_ptr(spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size), - src0_row_size_padded, src0_row_size, 1); - const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; - mmctx->vec_dot_1x1(ne00, &dst_row[ir0], ss0, src1_col); + // Prefill spad with src0, src2, src3 rows + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const int is0 = (ir0 - src0_start_row); + if (is0 >= (int)n_prefetch) { + break; } + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + is0 * src0_stride, src0_row + ir0 * src0_row_size), + src0_stride, src0_row_size, src0_row_size, 2); + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src2_ptr + is0 * src2_stride, src2_row + ir0 * src2_row_size), + src2_stride, src2_row_size, src2_row_size, 2); + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src3_ptr + is0 * src3_stride, src3_row + ir0 * src3_row_size), + src3_stride, src3_row_size, src3_row_size, 2); } - t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "matvec-id-%s %d/%d: %ux%ux%ux%u (%u:%u) * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u usec %u\n", mmctx->type, - ith, nth, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0_start_row, src0_end_row, src1->ne[0], - src1->ne[1], src1->ne[2], src1->ne[3], src2->ne[0], src2->ne[1], src2->ne[2], src2->ne[3], dst->ne[0], - dst->ne[1], dst->ne[2], dst->ne[3], (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); -} - -// *** dynamic quant - -static inline void quantize_block_f32_q8_1x1(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { - assert((unsigned long) x % 128 == 0); - assert((unsigned long) y_q % 128 == 0); - - HVX_Vector * vx = (HVX_Vector *) x; - HVX_Vector zero = Q6_V_vzero(); - - // Use reduce max fp32 to find max(abs(e)) first - HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0])); - HVX_Vector vmax1_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[1])); - HVX_Vector vmax2_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[2])); - HVX_Vector vmax3_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[3])); - - // Load and convert into QF32 - HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements - HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements - HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements - HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); // 32 elements - - // Convert to QF32 - HVX_Vector vmax0_qf = Q6_Vqf32_vsub_VsfVsf(vmax0_sf, zero); - HVX_Vector vmax1_qf = Q6_Vqf32_vsub_VsfVsf(vmax1_sf, zero); - HVX_Vector vmax2_qf = Q6_Vqf32_vsub_VsfVsf(vmax2_sf, zero); - HVX_Vector vmax3_qf = Q6_Vqf32_vsub_VsfVsf(vmax3_sf, zero); - - // Combine and convert to fp16 - HVX_Vector vmax01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax1_qf, vmax0_qf))); - HVX_Vector vmax23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax3_qf, vmax2_qf))); - - // Convert into fp16 - HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf))); - HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); - - HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 - HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 - HVX_Vector vd01_hf = Q6_Vhf_equals_Vqf16(vd01_qf16); - HVX_Vector vd23_hf = Q6_Vhf_equals_Vqf16(vd23_qf16); - - // Divide input by the scale - HVX_Vector vd01_inv_hf = hvx_vec_inverse_f16(vd01_hf); - HVX_Vector vd23_inv_hf = hvx_vec_inverse_f16(vd23_hf); - vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf)); - vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf)); - - // Convert to int8 - HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf); - HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf); - HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16); - - *(HVX_Vector *) y_q = vx_i8; - - // --- Sum calculation --- - const HVX_Vector ones = Q6_Vb_vsplat_R(1); - HVX_Vector v_sums = Q6_Vw_vrmpy_VbVb(vx_i8, ones); // sum every 4 consecutive elements - // Sum 8 elements: - v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 4)); - v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 8)); - v_sums = Q6_Vw_vadd_VwVw(v_sums, Q6_V_vror_VR(v_sums, 16)); - - // Copy to stack to extract sums and vmaxes - float vmax0[32] __attribute__((aligned(128))); - float vmax1[32] __attribute__((aligned(128))); - float vmax2[32] __attribute__((aligned(128))); - float vmax3[32] __attribute__((aligned(128))); - int32_t sums[32] __attribute__((aligned(128))); - - hvx_vec_store_u(vmax0, 128, vmax0_sf); - hvx_vec_store_u(vmax1, 128, vmax1_sf); - hvx_vec_store_u(vmax2, 128, vmax2_sf); - hvx_vec_store_u(vmax3, 128, vmax3_sf); - hvx_vec_store_u(sums, 128, v_sums); - - float d0 = vmax0[0] / 127.0f; - float d1 = vmax1[0] / 127.0f; - float d2 = vmax2[0] / 127.0f; - float d3 = vmax3[0] / 127.0f; - - __fp16 * y_d_half = (__fp16 *) y_d; - y_d_half[0] = d0; - y_d_half[1] = (float) sums[0] * d0; - y_d_half[2] = d1; - y_d_half[3] = (float) sums[8] * d1; - y_d_half[4] = d2; - y_d_half[5] = (float) sums[16] * d2; - y_d_half[6] = d3; - y_d_half[7] = (float) sums[24] * d3; -} + // Process rows + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + const uint8_t * ss2 = dma_queue_pop(dma_queue).dst; + const uint8_t * ss3 = dma_queue_pop(dma_queue).dst; -static inline void quantize_block_f32_q8x1(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { - assert((unsigned long) x % 128 == 0); - assert((unsigned long) y_q % 128 == 0); - - HVX_Vector * vx = (HVX_Vector *) x; - HVX_Vector zero = Q6_V_vzero(); - - // Use reduce max fp32 to find max(abs(e)) first - HVX_Vector vmax0_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[0])); - HVX_Vector vmax1_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[1])); - HVX_Vector vmax2_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[2])); - HVX_Vector vmax3_sf = hvx_vec_reduce_max_f32(hvx_vec_abs_f32(vx[3])); - // Load and convert into QF32 - HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements - HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements - HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements - HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); // 32 elements - - // Convert to QF32 - HVX_Vector vmax0_qf = Q6_Vqf32_vsub_VsfVsf(vmax0_sf, zero); // replicated over all lanes - HVX_Vector vmax1_qf = Q6_Vqf32_vsub_VsfVsf(vmax1_sf, zero); // replicated over all lanes - HVX_Vector vmax2_qf = Q6_Vqf32_vsub_VsfVsf(vmax2_sf, zero); // replicated over all lanes - HVX_Vector vmax3_qf = Q6_Vqf32_vsub_VsfVsf(vmax3_sf, zero); // replicated over all lanes - - // Combine and convert to fp16 - HVX_Vector vmax01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax1_qf, vmax0_qf))); - HVX_Vector vmax23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vmax3_qf, vmax2_qf))); - - // Convert into fp16 - HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf))); - HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); - - HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 - HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 - HVX_Vector vd01_hf = Q6_Vhf_equals_Vqf16(vd01_qf16); - HVX_Vector vd23_hf = Q6_Vhf_equals_Vqf16(vd23_qf16); - - hvx_vec_store_u(y_d + 0, 2, vd01_hf); - HVX_Vector rotated_vd_hf = Q6_V_vror_VR(vd01_hf, 64); - hvx_vec_store_u(y_d + 2, 2, rotated_vd_hf); - - hvx_vec_store_u(y_d + 4, 2, vd23_hf); - rotated_vd_hf = Q6_V_vror_VR(vd23_hf, 64); - hvx_vec_store_u(y_d + 6, 2, rotated_vd_hf); - - // Divide input by the scale - HVX_Vector vd01_inv_hf = hvx_vec_inverse_f16(vd01_hf); - HVX_Vector vd23_inv_hf = hvx_vec_inverse_f16(vd23_hf); - vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf)); - vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf)); - - // Convert to int8 - HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf); - HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf); - HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16); - - *(HVX_Vector *) y_q = vx_i8; -} + // Process src1 columns in pairs (2×2 tiling) + uint32_t ir1 = 0; + for (; ir1 + 1 < src1_nrows; ir1 += 2) { + const uint8_t * restrict src1_col0 = (const uint8_t *) (src1_data + (ir1+0) * src1_stride); + const uint8_t * restrict src1_col1 = (const uint8_t *) (src1_data + (ir1+1) * src1_stride); -static inline void quantize_block_f32_q8x2(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { - assert((unsigned long) x % 128 == 0); - assert((unsigned long) y_q % 128 == 0); + float * restrict dst_row0_k = (float *) (dst_k->data + ((ir1+0) * dst_k_row_size)); + float * restrict dst_row1_k = (float *) (dst_k->data + ((ir1+1) * dst_k_row_size)); + mmctx->vec_dot_2x2(ne00, &dst_row0_k[ir0], &dst_row1_k[ir0], ss0, ss0 + src0_stride, src1_col0, src1_col1); - HVX_Vector * vx = (HVX_Vector *) x; + float * restrict dst_row0_v = (float *) (dst_v->data + ((ir1+0) * dst_k_row_size)); + float * restrict dst_row1_v = (float *) (dst_v->data + ((ir1+1) * dst_k_row_size)); + mmctx->vec_dot_2x2(ne00, &dst_row0_v[ir0], &dst_row1_v[ir0], ss2, ss2 + src2_stride, src1_col0, src1_col1); - // Load and convert into QF32 - HVX_Vector zero = Q6_V_vzero(); - HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements - HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements - HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements - HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); // 32 elements + float * restrict dst_row0_q = (float *) (dst_q->data + ((ir1+0) * dst_q_row_size)); + float * restrict dst_row1_q = (float *) (dst_q->data + ((ir1+1) * dst_q_row_size)); + mmctx->vec_dot_2x2(ne00, &dst_row0_q[ir0], &dst_row1_q[ir0], ss3, ss3 + src3_stride, src1_col0, src1_col1); + } - // Convert into fp16 - HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf))); - HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); + // Handle remaining src1 rows (fallback to 2×1) + for (; ir1 < src1_nrows; ++ir1) { + const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride); - // Compute max and scale - HVX_Vector vmax01_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf)); // replicated over all lanes - HVX_Vector vmax23_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx23_hf)); // replicated over all lanes + float * restrict dst_row_k = (float *) (dst_k->data + (ir1 * dst_k_row_size)); + mmctx->vec_dot_2x1(ne00, &dst_row_k[ir0], ss0, ss0 + src0_stride, src1_col); - HVX_Vector vd01_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax01_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 - HVX_Vector vd23_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax23_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 - HVX_Vector vd01_hf = Q6_Vhf_equals_Vqf16(vd01_qf16); - HVX_Vector vd23_hf = Q6_Vhf_equals_Vqf16(vd23_qf16); + float * restrict dst_row_v = (float *) (dst_v->data + (ir1 * dst_k_row_size)); + mmctx->vec_dot_2x1(ne00, &dst_row_v[ir0], ss2, ss2 + src2_stride, src1_col); - hvx_vec_store_u(y_d + 0, 4, vd01_hf); - hvx_vec_store_u(y_d + 4, 4, vd23_hf); + float * restrict dst_row_q = (float *) (dst_q->data + (ir1 * dst_q_row_size)); + mmctx->vec_dot_2x1(ne00, &dst_row_q[ir0], ss3, ss3 + src3_stride, src1_col); + } - // Divide input by the scale - HVX_Vector vd01_inv_hf = hvx_vec_inverse_f16(vd01_hf); - HVX_Vector vd23_inv_hf = hvx_vec_inverse_f16(vd23_hf); - vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd01_inv_hf)); - vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd23_inv_hf)); + // Prefetch next (n + vtcm_nrows) rows + const int pr0 = (ir0 + n_prefetch); + const int is0 = (pr0 - src0_start_row) & prefetch_mask; + if (pr0 < src0_end_row_x2) { + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + is0 * src0_stride, src0_row + pr0 * src0_row_size), + src0_stride, src0_row_size, src0_row_size, 2); + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src2_ptr + is0 * src2_stride, src2_row + pr0 * src2_row_size), + src2_stride, src2_row_size, src2_row_size, 2); + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src3_ptr + is0 * src3_stride, src3_row + pr0 * src3_row_size), + src3_stride, src3_row_size, src3_row_size, 2); + } + } - // Convert to int8 - HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf); - HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf); - HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16); + // Process last row (if any) + if (src0_end_row != src0_end_row_x2) { + uint32_t ir0 = src0_end_row_x2; + const int is0 = (ir0 - src0_start_row) & prefetch_mask; + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + is0 * src0_stride, src0_row + ir0 * src0_row_size), + src0_stride, src0_row_size, src0_row_size, 1); + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src2_ptr + is0 * src2_stride, src2_row + ir0 * src2_row_size), + src2_stride, src2_row_size, src2_row_size, 1); + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src3_ptr + is0 * src3_stride, src3_row + ir0 * src3_row_size), + src3_stride, src3_row_size, src3_row_size, 1); - *(HVX_Vector *) y_q = vx_i8; -} + const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + const uint8_t * ss2 = dma_queue_pop(dma_queue).dst; + const uint8_t * ss3 = dma_queue_pop(dma_queue).dst; -static inline void quantize_block_f32_q8x4(float * restrict x, uint8_t * restrict y_q, uint8_t * restrict y_d) { - assert((unsigned long) x % 128 == 0); - assert((unsigned long) y_q % 128 == 0); + for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) { + const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride); - HVX_Vector * vx = (HVX_Vector *) x; + float * restrict dst_row_k = (float *) (dst_k->data + (ir1 * dst_k_row_size)); + mmctx->vec_dot_1x1(ne00, &dst_row_k[ir0], ss0, src1_col); - // Load and convert into QF32 - HVX_Vector zero = Q6_V_vzero(); - HVX_Vector vx0_qf = Q6_Vqf32_vsub_VsfVsf(vx[0], zero); // 32 elements - HVX_Vector vx1_qf = Q6_Vqf32_vsub_VsfVsf(vx[1], zero); // 32 elements - HVX_Vector vx2_qf = Q6_Vqf32_vsub_VsfVsf(vx[2], zero); // 32 elements - HVX_Vector vx3_qf = Q6_Vqf32_vsub_VsfVsf(vx[3], zero); // 32 elements + float * restrict dst_row_v = (float *) (dst_v->data + (ir1 * dst_k_row_size)); + mmctx->vec_dot_1x1(ne00, &dst_row_v[ir0], ss2, src1_col); - // Convert into fp16 - HVX_Vector vx01_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx1_qf, vx0_qf))); - HVX_Vector vx23_hf = Q6_Vh_vdeal_Vh(Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(vx3_qf, vx2_qf))); + float * restrict dst_row_q = (float *) (dst_q->data + (ir1 * dst_q_row_size)); + mmctx->vec_dot_1x1(ne00, &dst_row_q[ir0], ss3, src1_col); + } + } +} - // Compute max and scale - HVX_Vector vmax_hf = hvx_vec_reduce_max_f16(hvx_vec_abs_f16(vx01_hf)); - vmax_hf = hvx_vec_reduce_max2_f16(hvx_vec_abs_f16(vx23_hf), vmax_hf); // replicated over all lanes +static void hvx_mm_ffn_2d(unsigned int nth, unsigned int ith, void * data) { + struct htp_mm_context * mmctx = data; + struct htp_ops_context * octx = mmctx->octx; - HVX_Vector vd_qf16 = Q6_Vqf16_vmpy_VhfVhf(vmax_hf, Q6_Vh_vsplat_R(0x2008)); // 1.0 / 127.0 - HVX_Vector vd_hf = Q6_Vhf_equals_Vqf16(vd_qf16); + const struct htp_tensor * restrict src0 = octx->src[0]; // Wgate + const struct htp_tensor * restrict src1 = octx->src[1]; // y + const struct htp_tensor * restrict src2 = octx->src[2]; // Wup + const struct htp_tensor * restrict dst_gate = octx->dsts[0]; + const struct htp_tensor * restrict dst_up = octx->dsts[1]; - *(HVX_UVector *) y_d = vd_hf; + const uint32_t ne00 = src0->ne[0]; + const uint32_t ne01 = src0->ne[1]; + const uint32_t ne02 = src0->ne[2]; + const uint32_t ne03 = src0->ne[3]; - // Divide input by the scale - HVX_Vector vd_inv_hf = hvx_vec_inverse_f16(vd_hf); - vx01_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx01_hf, vd_inv_hf)); - vx23_hf = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(vx23_hf, vd_inv_hf)); + const uint32_t ne11 = src1->ne[1]; + const uint32_t ne12 = src1->ne[2]; + const uint32_t ne13 = src1->ne[3]; - // Convert to int8 - HVX_Vector vx01_i16 = hvx_vec_i16_from_hf_rnd_sat(vx01_hf); - HVX_Vector vx23_i16 = hvx_vec_i16_from_hf_rnd_sat(vx23_hf); - HVX_Vector vx_i8 = Q6_Vb_vpack_VhVh_sat(vx23_i16, vx01_i16); + const uint32_t src0_nrows = ne01 * ne02 * ne03; + const uint32_t src1_nrows = ne11 * ne12 * ne13; - *(HVX_Vector *) y_q = vx_i8; -} + const uint32_t src0_nrows_per_thread = mmctx->src0_nrows_per_thread; + const uint32_t src0_start_row = src0_nrows_per_thread * ith; + const uint32_t src0_end_row = MIN(src0_start_row + src0_nrows_per_thread, src0_nrows); + const uint32_t src0_end_row_x2 = src0_start_row + ((src0_end_row - src0_start_row) & ~1U); -// Overrides input x -static void quantize_row_f32_q8x4x2(float * restrict x, uint8_t * restrict y, uint32_t k) { - assert(k % 32 == 0); - const uint32_t qk = QK_Q8_0x4x2; - const uint32_t nb = (k + qk - 1) / qk; - - const uint32_t qrow_size = k; // int8 - - const uint32_t dblk_size = 8 * 2; // 8x __fp16 - const uint32_t qblk_size = QK_Q8_0x4x2; // int8 - - uint8_t * restrict y_q = (y + 0); // quants first - uint8_t * restrict y_d = (y + qrow_size); // then scales - - // Temp scales override input since we're working off of the aligned temp buffer in VTCM - uint8_t * restrict t_d = (uint8_t *) x; - - for (uint32_t i = 0; i < nb; i++) { -#if FP32_QUANTIZE_GROUP_SIZE == 32 - quantize_block_f32_q8x1(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); - quantize_block_f32_q8x1(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); -#elif FP32_QUANTIZE_GROUP_SIZE == 64 - quantize_block_f32_q8x2(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); - quantize_block_f32_q8x2(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); -#elif FP32_QUANTIZE_GROUP_SIZE == 128 - quantize_block_f32_q8x4(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); - quantize_block_f32_q8x4(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); -#else -#error "FP32_QUANTIZE_GROUP_SIZE must be 32, 64, or 128" -#endif + if (src0_start_row >= src0_end_row) { + return; } - // now copy the scales into final location - hvx_copy_f16_ua(y_d, t_d, nb * 8); -} + const size_t dst_row_size = dst_gate->nb[1]; + const size_t src0_row_size = src0->nb[1]; + const size_t src2_row_size = src2->nb[1]; -static void quantize_f32_q8x4x2(unsigned int nth, unsigned int ith, void * data) { - struct htp_matmul_context * mmctx = data; - struct htp_ops_context * octx = mmctx->octx; + const size_t src0_stride = mmctx->vtcm_src0_stride; + const size_t src2_stride = mmctx->vtcm_src2_stride; + const size_t src1_stride = mmctx->vtcm_src1_stride; - const struct htp_tensor * src = octx->src[1]; - uint8_t * restrict dst = octx->src1_spad.data; - struct htp_spad * spad = &octx->src0_spad; - uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread; + uint8_t * restrict vtcm_src0_ptr = mmctx->vtcm_src0 + mmctx->vtcm_src0_size_per_thread * ith; + uint8_t * restrict vtcm_src2_ptr = mmctx->vtcm_src2 + mmctx->vtcm_src2_size_per_thread * ith; + uint8_t * restrict src1_data = mmctx->vtcm_src1; - uint64_t t1 = HAP_perf_get_qtimer_count(); + dma_queue * dma_queue = octx->ctx->dma[ith]; - const uint32_t ne0 = src->ne[0]; - const uint32_t ne1 = src->ne[1]; - const uint32_t ne2 = src->ne[2]; - const uint32_t ne3 = src->ne[3]; + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; + const uint32_t n_prefetch = kparams->n_prefetch; + assert(n_prefetch >= 2 && n_prefetch <= HTP_MM_MAX_PREFETCH && (n_prefetch & (n_prefetch - 1)) == 0); + const uint32_t prefetch_mask = n_prefetch - 1; - const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows + const uint8_t * restrict src0_row = (const uint8_t *) src0->data; + const uint8_t * restrict src2_row = (const uint8_t *) src2->data; - const uint32_t ir_first = nrows_per_thread * ith; // first row - const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row + // Prefill spad with src0, src2 rows + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const int is0 = (ir0 - src0_start_row); + if (is0 >= (int)n_prefetch) { + break; + } + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + is0 * src0_stride, src0_row + ir0 * src0_row_size), + src0_stride, src0_row_size, src0_row_size, 2); + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src2_ptr + is0 * src2_stride, src2_row + ir0 * src2_row_size), + src2_stride, src2_row_size, src2_row_size, 2); + } - const size_t src_row_size = src->nb[1]; - const size_t dst_row_size = q8x4x2_row_size(ne0); + // Process rows + for (uint32_t ir0 = src0_start_row; ir0 < src0_end_row_x2; ir0 += 2) { + const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + const uint8_t * ss2 = dma_queue_pop(dma_queue).dst; - uint8_t * restrict src_data = (uint8_t *) src->data + (src_row_size * ir_first); - uint8_t * restrict dst_data = (uint8_t *) dst + (dst_row_size * ir_first); - uint8_t * restrict tmp_data = (uint8_t *) spad->data + (spad->size_per_thread * ith); + // Process src1 columns in pairs (2×2 tiling) + uint32_t ir1 = 0; + for (; ir1 + 1 < src1_nrows; ir1 += 2) { + const uint8_t * restrict src1_col0 = (const uint8_t *) (src1_data + (ir1+0) * src1_stride); + const uint8_t * restrict src1_col1 = (const uint8_t *) (src1_data + (ir1+1) * src1_stride); - const size_t src_row_size_padded = hex_round_up(src_row_size, QK_Q8_0x4x2 * sizeof(float)); - memset(tmp_data, 0, src_row_size_padded); // zero-out temp row data for padding + float * restrict dst_row0_gate = (float *) (dst_gate->data + ((ir1+0) * dst_row_size)); + float * restrict dst_row1_gate = (float *) (dst_gate->data + ((ir1+1) * dst_row_size)); + mmctx->vec_dot_2x2(ne00, &dst_row0_gate[ir0], &dst_row1_gate[ir0], ss0, ss0 + src0_stride, src1_col0, src1_col1); - for (uint32_t i = ir_first; i < ir_last; ++i) { - hex_l2fetch(src_data, src_row_size, src_row_size, 2); - hvx_copy_f32_aa(tmp_data, src_data, ne0); + float * restrict dst_row0_up = (float *) (dst_up->data + ((ir1+0) * dst_row_size)); + float * restrict dst_row1_up = (float *) (dst_up->data + ((ir1+1) * dst_row_size)); + mmctx->vec_dot_2x2(ne00, &dst_row0_up[ir0], &dst_row1_up[ir0], ss2, ss2 + src2_stride, src1_col0, src1_col1); + } - // FARF(HIGH, "quantize-q8x4-row: %u\n", i); - quantize_row_f32_q8x4x2((float *) tmp_data, dst_data, ne0); - dst_data += dst_row_size; - src_data += src_row_size; - } + // Handle remaining src1 rows (fallback to 2×1) + for (; ir1 < src1_nrows; ++ir1) { + const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride); - uint64_t t2 = HAP_perf_get_qtimer_count(); + float * restrict dst_row_gate = (float *) (dst_gate->data + (ir1 * dst_row_size)); + mmctx->vec_dot_2x1(ne00, &dst_row_gate[ir0], ss0, ss0 + src0_stride, src1_col); - FARF(HIGH, "quantize-f32-q8x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first, - ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); -} + float * restrict dst_row_up = (float *) (dst_up->data + (ir1 * dst_row_size)); + mmctx->vec_dot_2x1(ne00, &dst_row_up[ir0], ss2, ss2 + src2_stride, src1_col); + } -static void quantize_row_f32_q8_1x4x2(float * restrict x, uint8_t * restrict y, uint32_t k) { - assert(k % 32 == 0); - const uint32_t qk = QK_Q8_0x4x2; - const uint32_t nb = (k + qk - 1) / qk; + // Prefetch next rows + const int pr0 = (ir0 + n_prefetch); + const int is0 = (pr0 - src0_start_row) & prefetch_mask; + if (pr0 < src0_end_row_x2) { + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + is0 * src0_stride, src0_row + pr0 * src0_row_size), + src0_stride, src0_row_size, src0_row_size, 2); + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src2_ptr + is0 * src2_stride, src2_row + pr0 * src2_row_size), + src2_stride, src2_row_size, src2_row_size, 2); + } + } - const uint32_t qrow_size = k; // int8 + // Process last row (if any) + if (src0_end_row != src0_end_row_x2) { + uint32_t ir0 = src0_end_row_x2; + const int is0 = (ir0 - src0_start_row) & prefetch_mask; + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src0_ptr + is0 * src0_stride, src0_row + ir0 * src0_row_size), + src0_stride, src0_row_size, src0_row_size, 1); + dma_queue_push(dma_queue, dma_make_ptr(vtcm_src2_ptr + is0 * src2_stride, src2_row + ir0 * src2_row_size), + src2_stride, src2_row_size, src2_row_size, 1); - const uint32_t dblk_size = 8 * 4; // 8x (d, s) __fp16 = 32 bytes - const uint32_t qblk_size = QK_Q8_0x4x2; // int8 + const uint8_t * ss0 = dma_queue_pop(dma_queue).dst; + const uint8_t * ss2 = dma_queue_pop(dma_queue).dst; - uint8_t * restrict y_q = (y + 0); // quants first - uint8_t * restrict y_d = (y + qrow_size); // then scales/sums + for (uint32_t ir1 = 0; ir1 < src1_nrows; ++ir1) { + const uint8_t * restrict src1_col = (const uint8_t *) (src1_data + ir1 * src1_stride); - // Temp scales override input since we're working off of the aligned temp buffer in VTCM - uint8_t * restrict t_d = (uint8_t *) x; + float * restrict dst_row_gate = (float *) (dst_gate->data + (ir1 * dst_row_size)); + mmctx->vec_dot_1x1(ne00, &dst_row_gate[ir0], ss0, src1_col); - for (uint32_t i = 0; i < nb; i++) { - quantize_block_f32_q8_1x1(x + (i*2 + 0) * qk/2, y_q + (i*2 + 0) * qblk_size/2, t_d + (i*2 + 0) * dblk_size/2); - quantize_block_f32_q8_1x1(x + (i*2 + 1) * qk/2, y_q + (i*2 + 1) * qblk_size/2, t_d + (i*2 + 1) * dblk_size/2); + float * restrict dst_row_up = (float *) (dst_up->data + (ir1 * dst_row_size)); + mmctx->vec_dot_1x1(ne00, &dst_row_up[ir0], ss2, src1_col); + } } +} - // now copy the scales/sums into final location - hvx_copy_f16_ua(y_d, t_d, nb * 16); +#define DEQUANTIZE_WORKER_LOOP_IMPL(SUFFIX) \ +static void dequantize_tiled_worker_loop_##SUFFIX(unsigned int n, unsigned int i, void *data) { \ + tiled_dequantize_state_t *state = (tiled_dequantize_state_t *)data; \ + struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL; \ + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i); \ + for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { \ + int start = task_id * state->n_tiles_per_task; \ + int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); \ + dequantize_tiled_weight_to_fp16_task_##SUFFIX(state, start, end); \ + } \ + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i); \ } -static void quantize_f32_q8_1x4x2(unsigned int nth, unsigned int ith, void * data) { - struct htp_matmul_context * mmctx = data; - struct htp_ops_context * octx = mmctx->octx; +DEQUANTIZE_WORKER_LOOP_IMPL(q4_0) +DEQUANTIZE_WORKER_LOOP_IMPL(q4_1) +DEQUANTIZE_WORKER_LOOP_IMPL(iq4_nl) +DEQUANTIZE_WORKER_LOOP_IMPL(mxfp4) +DEQUANTIZE_WORKER_LOOP_IMPL(q8_0) - const struct htp_tensor * src = octx->src[1]; - uint8_t * restrict dst = octx->src1_spad.data; - struct htp_spad * spad = &octx->src0_spad; - uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread; +static void convert_f16_worker_loop(unsigned int n, unsigned int i, void *data) { + tiled_dequantize_state_t *state = (tiled_dequantize_state_t *)data; + struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i); + for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { + int start = task_id * state->n_tiles_per_task; + int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); + convert_f16_weight_to_fp16_tiles_task(state, start, end); + } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_W_DEQUANT, i); +} - uint64_t t1 = HAP_perf_get_qtimer_count(); +static void quantize_f32_worker_loop(unsigned int n, unsigned int i, void *data) { + tiled_dequantize_state_t *state = (tiled_dequantize_state_t *)data; - const uint32_t ne0 = src->ne[0]; - const uint32_t ne1 = src->ne[1]; - const uint32_t ne2 = src->ne[2]; - const uint32_t ne3 = src->ne[3]; + struct htp_thread_trace * tr = state->traces ? &state->traces[i] : NULL; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_QUANT, i); - const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows + for (unsigned int task_id = i; task_id < (unsigned int)state->n_tasks; task_id += n) { + int start = task_id * state->n_tiles_per_task; + int end = hex_smin(start + state->n_tiles_per_task, state->n_tot_tiles); + quantize_f32_weight_to_fp16_tiles_task(state, start, end); + } - const uint32_t ir_first = nrows_per_thread * ith; // first row - const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_QUANT, i); +} - const size_t src_row_size = src->nb[1]; - const size_t dst_row_size = q8_1x4x2_row_size(ne0); +static void transfer_output_chunk_worker_fn(unsigned int n, unsigned int i, void *data) { + output_transfer_task_state_t *st = (output_transfer_task_state_t *) data; - uint8_t * restrict src_data = (uint8_t *) src->data + (src_row_size * ir_first); - uint8_t * restrict dst_data = (uint8_t *) dst + (dst_row_size * ir_first); - uint8_t * restrict tmp_data = (uint8_t *) spad->data + (spad->size_per_thread * ith); + struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL; - const size_t src_row_size_padded = hex_round_up(src_row_size, QK_Q8_0x4x2 * sizeof(float)); - memset(tmp_data, 0, src_row_size_padded); // zero-out temp row data for padding + int start_chunk_idx = i * st->n_chunks_per_task; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_O_PROC, start_chunk_idx); - for (uint32_t i = ir_first; i < ir_last; ++i) { - hex_l2fetch(src_data, src_row_size, src_row_size, 2); - hvx_copy_f32_aa(tmp_data, src_data, ne0); + for (unsigned int task_id = i; task_id < (unsigned int)st->n_tasks; task_id += n) { + int chunk_idx = task_id * st->n_chunks_per_task; + size_t chunk_size = hex_smin(st->n_tot_chunks - chunk_idx, st->n_chunks_per_task); - quantize_row_f32_q8_1x4x2((float *) tmp_data, dst_data, ne0); - dst_data += dst_row_size; - src_data += src_row_size; + float *dst = st->dst + chunk_idx * st->dst_stride; + transfer_output_chunk_fp16_to_fp32(dst, st->vtcm_src, chunk_idx, chunk_size, st->n_cols, st->dst_stride, st->dst_cols); } - uint64_t t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "quantize-f32-q8_1x4: %u/%u : n-rows %u (%u:%u) row-size %u -> %u usec %u\n", ith, nth, nrows, ir_first, - ir_last, src_row_size, dst_row_size, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_O_PROC, start_chunk_idx); } -static void quantize_f32_f32(unsigned int nth, unsigned int ith, void * data) { - struct htp_matmul_context * mmctx = data; - struct htp_ops_context * octx = mmctx->octx; +static void transfer_activation_chunk_worker_fn(unsigned int n, unsigned int i, void *data) { + activation_transfer_task_state_t *st = (activation_transfer_task_state_t *) data; - const struct htp_tensor * src = octx->src[1]; - uint8_t * restrict dst = octx->src1_spad.data; - uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread; - uint32_t dst_stride = octx->src1_spad.stride; + struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL; - uint64_t t1 = HAP_perf_get_qtimer_count(); + int start_chunk_idx = i * st->n_chunks_per_task; + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_PREP, start_chunk_idx); - const uint32_t ne0 = src->ne[0]; - const uint32_t ne1 = src->ne[1]; - const uint32_t ne2 = src->ne[2]; - const uint32_t ne3 = src->ne[3]; + for (unsigned int task_id = i; task_id < (unsigned int)st->n_tasks; task_id += n) { + int chunk_idx = task_id * st->n_chunks_per_task; + size_t chunk_size = hex_smin(st->n_tot_chunks - chunk_idx, st->n_chunks_per_task); - const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows + __fp16 *dst = st->dst + chunk_idx * st->k_block; + const float *src = st->src + chunk_idx * st->k_stride; - const uint32_t ir_first = nrows_per_thread * ith; // first row - const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row + if (st->vtcm_f32_act) { + float *thread_f32_act = st->vtcm_f32_act + i * HTP_MM_DMA_ACT_MULTIPLIER * st->k_block; + transfer_activation_chunk_fp32_to_fp16_dma_pipelined( + st->ctx->dma[i], dst, src, chunk_size, st->k_block, st->k_stride, st->k_valid, thread_f32_act + ); + } else { + transfer_activation_chunk_fp32_to_fp16(dst, src, chunk_size, st->k_block, st->k_stride, st->k_valid); + } + } - const size_t src_row_size = ne0 * sizeof(float); - const size_t src_stride = src->nb[1]; + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_PREP, start_chunk_idx); +} + +static void transfer_activation_chunk_gathered_worker_fn(unsigned int n, unsigned int i, void *data) { + activation_transfer_gathered_task_state_t *st = data; + struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL; + int chunk_idx = i; + int chunk_size = st->n_chunks_per_task; + int start_row = st->start_row + chunk_idx * chunk_size; + int n_rows = hex_smin(st->cne1 - start_row, chunk_size); + if (n_rows > 0) { + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_PREP, chunk_idx); + transfer_activation_chunk_fp32_to_fp16_gathered( + st->dst, st->src, start_row, n_rows, st->k_block, + st->matrix_rows, st->cur_a, st->mapping_stride, + st->ne11, &st->ne11_div, st->nb11, st->nb12, st->cne1, st->k_valid); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_PREP, chunk_idx); + } +} + +static void transfer_activation_chunk_gathered_worker_flat_fn(unsigned int n, unsigned int i, void *data) { + activation_transfer_gathered_task_state_t *st = data; + struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL; + int chunk_idx = i; + int chunk_size = st->n_chunks_per_task; + int start_row = st->start_row + chunk_idx * chunk_size; + int n_rows = hex_smin(st->cne1 - start_row, chunk_size); + if (n_rows > 0) { + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_A_PREP, chunk_idx); + transfer_activation_chunk_fp32_to_fp16_gathered_flat( + st->dst, st->src, start_row, n_rows, st->k_block, + st->matrix_rows, st->cur_a, st->mapping_stride, + st->nb12, st->cne1, st->k_valid); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_A_PREP, chunk_idx); + } +} + +static void transfer_output_chunk_scattered_worker_fn(unsigned int n, unsigned int i, void *data) { + output_transfer_scattered_task_state_t *st = data; + struct htp_thread_trace * tr = st->traces ? &st->traces[i] : NULL; + int chunk_idx = i; + int chunk_size = st->n_chunks_per_task; + int start_row = st->start_row + chunk_idx * chunk_size; + int n_rows = hex_smin(st->cne1 - start_row, chunk_size); + if (n_rows > 0) { + htp_trace_event_start(tr, HTP_TRACE_EVT_HVX_O_PROC, chunk_idx); + transfer_output_chunk_fp16_to_fp32_scattered( + st->dst, st->vtcm_src, start_row, n_rows, st->n_cols, + st->matrix_rows, st->cur_a, st->mapping_stride, + st->dst_nb1, st->dst_nb2, st->cne1); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HVX_O_PROC, chunk_idx); + } +} + +// --- HMX Dispatchers & Entry Points --- + +static void dequantize_tiled_weight_chunk_to_fp16_tiles( + struct htp_context *ctx, __fp16 *vtcm_dst, + const void *weight_src_ddr, + int n_cols, int k_block, + size_t row_stride, int weight_type, + int n_k_tiles, struct fastdiv_values n_k_tiles_div, + worker_callback_t dequant_worker_fn, int n_threads) { + + assert(n_cols % HTP_MM_HMX_TILE_N_COLS == 0); + assert(k_block % HTP_MM_HMX_TILE_N_COLS == 0); + + size_t n_col_tiles = n_cols / HTP_MM_HMX_TILE_N_COLS; + size_t n_tot_tiles = n_col_tiles * n_k_tiles; + + size_t n_tiles_per_task = (n_threads == 1) ? n_tot_tiles : hmx_ceil_div(n_tot_tiles, n_threads); + + tiled_dequantize_state_t state; + state.n_tasks = (n_tot_tiles + n_tiles_per_task - 1) / n_tiles_per_task; + state.n_tot_tiles = n_tot_tiles; + state.n_tiles_per_task = n_tiles_per_task; + state.dst = vtcm_dst; + state.src = (const uint8_t *)weight_src_ddr; + state.n_cols = n_cols; + state.k_block = k_block; + state.row_stride = row_stride; + state.weight_type = weight_type; + state.n_k_tiles = n_k_tiles; + state.n_k_tiles_div = n_k_tiles_div; + state.traces = ctx->trace; + state.ctx = ctx; + + state.tile_size = htp_mm_get_weight_tile_size(weight_type); + state.aligned_tile_size = htp_mm_get_weight_aligned_tile_size(weight_type); + + if (state.n_tasks == 1 || n_threads == 1) { + dequant_worker_fn(1, 0, &state); + } else { + int n_tasks = hex_smin((int) state.n_tasks, n_threads); + worker_pool_run_func(ctx->worker_pool, dequant_worker_fn, &state, n_tasks); + } +} - uint8_t * restrict src_data = (uint8_t *) src->data + (src_stride * ir_first); - uint8_t * restrict dst_data = (uint8_t *) dst + (dst_stride * ir_first); +static void transfer_output_chunk_threaded(struct htp_context *ctx, float *dst, const __fp16 *vtcm_src, + int n_rows, int n_cols, int dst_stride, int dst_cols, int n_threads) { + assert(n_cols % HTP_MM_HMX_TILE_N_COLS == 0); - for (uint32_t i = ir_first; i < ir_last; ++i) { - hex_l2fetch(src_data, src_row_size, src_stride, 2); - hvx_copy_f32_au(dst_data, src_data, ne0); + if (n_rows <= 0) return; - dst_data += dst_stride; - src_data += src_stride; - } + size_t n_tot_chunks = n_rows; + size_t n_chunks_per_task = (n_threads == 1) ? n_tot_chunks : hmx_ceil_div(n_rows, n_threads); + n_chunks_per_task = hex_align_up(n_chunks_per_task, 2); - uint64_t t2 = HAP_perf_get_qtimer_count(); + int actual_threads = hmx_ceil_div(n_rows, n_chunks_per_task); - FARF(HIGH, "quantize-f32-f32: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first, - ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); -} + output_transfer_task_state_t state; + state.n_tasks = actual_threads; + state.n_tot_chunks = n_tot_chunks; + state.n_chunks_per_task = n_chunks_per_task; + state.dst = dst; + state.vtcm_src = vtcm_src; + state.n_cols = n_cols; + state.dst_stride = dst_stride; + state.dst_cols = dst_cols; + state.traces = ctx->trace; -static void quantize_f32_f16(unsigned int nth, unsigned int ith, void * data) { - struct htp_matmul_context * mmctx = data; - struct htp_ops_context * octx = mmctx->octx; + if (actual_threads <= 1) { + transfer_output_chunk_worker_fn(1, 0, &state); + } else { + worker_pool_run_func(ctx->worker_pool, transfer_output_chunk_worker_fn, &state, actual_threads); + } +} + +static void transfer_activation_chunk_threaded( + struct htp_context *ctx, + __fp16 *dst, + const float *src, + int n_rows, + int k_block, + int k_stride, + int n_threads, + int k_valid, + float *vtcm_f32_act) { + assert(k_block % HTP_MM_HMX_TILE_N_COLS == 0 && k_stride % HTP_MM_HMX_TILE_N_COLS == 0); + + size_t n_tot_chunks = n_rows; + size_t n_chunks_per_task = (n_threads == 1) ? n_tot_chunks : 32; // must be multiple of 32 to ensure correct destination address + + activation_transfer_task_state_t state; + state.n_tasks = (n_tot_chunks + n_chunks_per_task - 1) / n_chunks_per_task; + state.n_tot_chunks = n_tot_chunks; + state.n_chunks_per_task = n_chunks_per_task; + state.dst = dst; + state.src = src; + state.k_block = k_block; + state.k_stride = k_stride; + state.k_valid = k_valid; + state.traces = ctx->trace; + state.ctx = ctx; + state.vtcm_f32_act = vtcm_f32_act; + + if (state.n_tasks == 1 || n_threads == 1) { + transfer_activation_chunk_worker_fn(1, 0, &state); + } else { + int n_tasks = hex_smin((int) state.n_tasks, n_threads); + worker_pool_run_func(ctx->worker_pool, transfer_activation_chunk_worker_fn, &state, n_tasks); + } +} + +static int hmx_mm_2d_f32(struct htp_context *ctx, + float *restrict dst, + const float *activation, + const uint8_t *weight, + int m, int k, int n, + int act_stride, + int weight_stride, + int weight_type, + int k_valid, + int dst_stride, + int dst_cols, + int m_chunk, + int n_chunk, + int pipeline, + int n_threads, + int act_threads, + int tile_size, + int aligned_tile_size, + int vtcm_size) { + if (k % 32 != 0 || n % 32 != 0) { return -1; } + if (!hex_is_aligned(dst, VLEN) || !hex_is_aligned(activation, VLEN)) { return -1; } + + size_t row_stride = htp_mm_get_tiled_row_stride(weight_type, k); + if (row_stride == 0) { + return -1; + } + + worker_callback_t dequant_worker_fn = NULL; + switch (weight_type) { + case HTP_TYPE_Q4_0: dequant_worker_fn = dequantize_tiled_worker_loop_q4_0; break; + case HTP_TYPE_IQ4_NL: dequant_worker_fn = dequantize_tiled_worker_loop_iq4_nl; break; + case HTP_TYPE_Q4_1: dequant_worker_fn = dequantize_tiled_worker_loop_q4_1; break; + case HTP_TYPE_MXFP4: dequant_worker_fn = dequantize_tiled_worker_loop_mxfp4; break; + case HTP_TYPE_Q8_0: dequant_worker_fn = dequantize_tiled_worker_loop_q8_0; break; + case HTP_TYPE_F16: dequant_worker_fn = convert_f16_worker_loop; break; + case HTP_TYPE_F32: dequant_worker_fn = quantize_f32_worker_loop; break; + default: + return -1; + } - const struct htp_tensor * src = octx->src[1]; - uint8_t * restrict dst = octx->src1_spad.data; - uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread; - uint32_t dst_stride = octx->src1_spad.stride; + const int n_k_tiles = k / HTP_MM_HMX_TILE_N_COLS; + const struct fastdiv_values n_k_tiles_div = init_fastdiv_values(n_k_tiles); - uint64_t t1 = HAP_perf_get_qtimer_count(); + const bool is_quant = (weight_type != HTP_TYPE_F16 && weight_type != HTP_TYPE_F32); + const size_t vec_dot_size = k * sizeof(__fp16); + const size_t vtcm_budget = ctx->vtcm_size; - const uint32_t ne0 = src->ne[0]; - const uint32_t ne1 = src->ne[1]; - const uint32_t ne2 = src->ne[2]; - const uint32_t ne3 = src->ne[3]; + size_t m_chunk_n_rows = m_chunk; + size_t n_chunk_n_cols = n_chunk; + size_t vtcm_used = vtcm_size; - const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows + const size_t qweight_row_stride = is_quant ? (size_t)(n_k_tiles * aligned_tile_size) / 32 : 0; - const uint32_t ir_first = nrows_per_thread * ith; // first row - const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row + const size_t act_f32_size = hex_align_up((size_t)act_threads * HTP_MM_DMA_ACT_MULTIPLIER * k * sizeof(float), HTP_MM_HMX_TILE_SIZE); - const size_t src_row_size = ne0 * sizeof(float); - const size_t src_stride = src->nb[1]; + const size_t weight_area_size = is_quant + ? hex_align_up((n_chunk_n_cols / 32) * n_k_tiles * aligned_tile_size, HTP_MM_HMX_TILE_SIZE) + : hex_align_up(n_chunk_n_cols * row_stride, HTP_MM_HMX_TILE_SIZE); + const size_t act_area_size = hex_align_up(m_chunk_n_rows * vec_dot_size, HTP_MM_HMX_TILE_SIZE); + const size_t output_area_size = hex_align_up(m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HTP_MM_HMX_TILE_SIZE); - uint8_t * restrict src_data = (uint8_t *) src->data + (src_stride * ir_first); - uint8_t * restrict dst_data = (uint8_t *) dst + (dst_stride * ir_first); + size_t scratch0_size, scratch1_size, scratch2_size; + scratch0_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HTP_MM_HMX_TILE_SIZE); // dequant buf 0 + scratch1_size = pipeline ? scratch0_size : 0; // dequant buf 1 + scratch2_size = pipeline ? output_area_size : 0; // output buf 1 - for (uint32_t i = ir_first; i < ir_last; ++i) { - hex_l2fetch(src_data, src_row_size, src_stride, 2); - hvx_copy_f16_f32_au(dst_data, src_data, ne0); + uint8_t *vtcm_ptr = (uint8_t *) ctx->vtcm_base; + __fp16 *vtcm_weight_raw[2] = { NULL, NULL }; + if (weight_area_size) { + if (pipeline) { + vtcm_weight_raw[0] = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size); + vtcm_weight_raw[1] = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size); + } else { + vtcm_weight_raw[0] = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size); + } + } + __fp16 *vtcm_f16_act = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, act_area_size); + float *vtcm_f32_act = (float *) vtcm_seq_alloc(&vtcm_ptr, act_f32_size); + __fp16 *vtcm_output = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, output_area_size); + void *vtcm_scratch0 = vtcm_seq_alloc(&vtcm_ptr, scratch0_size); + void *vtcm_scratch1 = scratch1_size ? vtcm_seq_alloc(&vtcm_ptr, scratch1_size) : NULL; + void *vtcm_scratch2 = scratch2_size ? vtcm_seq_alloc(&vtcm_ptr, scratch2_size) : NULL; + __fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256); - dst_data += dst_stride; - src_data += src_stride; + vtcm_used = vtcm_ptr - (uint8_t *) ctx->vtcm_base; + if (vtcm_used > vtcm_budget) { + FARF(ERROR, "hmx-mm-2d-precomputed: VTCM overflow: used %zu budget %zu, m %d k %d n %d mc %zu nc %zu", + vtcm_used, vtcm_budget, m, k, n, m_chunk_n_rows, n_chunk_n_cols); + return -1; } - uint64_t t2 = HAP_perf_get_qtimer_count(); + hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16 - FARF(HIGH, "quantize-f32-f16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first, - ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); -} + FARF(HIGH, "hmx-mm-2d-precomputed: standard : m %d k %d n %d wtype %d mc %zu nc %zu vtcm %zu/%zu", + m, k, n, weight_type, m_chunk_n_rows, n_chunk_n_cols, vtcm_used, vtcm_budget); -// TODO just a plain copy that should be done via the DMA during the Op setup -static void quantize_f16_f16(unsigned int nth, unsigned int ith, void * data) { - struct htp_matmul_context * mmctx = data; - struct htp_ops_context * octx = mmctx->octx; + int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols); - const struct htp_tensor * src = octx->src[1]; - uint8_t * restrict dst = octx->src1_spad.data; - uint32_t nrows_per_thread = mmctx->src1_nrows_per_thread; - uint32_t dst_stride = octx->src1_spad.stride; + if (pipeline) { + // --- Asynchronous Pipelined Loop --- + hmx_matmul_job_t job_slots[2]; // persistent double-buffered job descriptors + + for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) { + const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows); - uint64_t t1 = HAP_perf_get_qtimer_count(); + void *vtcm_weight_bufs[2] = { vtcm_scratch0, vtcm_scratch1 }; + void *vtcm_output_bufs[2] = { vtcm_output, vtcm_scratch2 }; - const uint32_t ne0 = src->ne[0]; - const uint32_t ne1 = src->ne[1]; - const uint32_t ne2 = src->ne[2]; - const uint32_t ne3 = src->ne[3]; + transfer_activation_chunk_threaded(ctx, vtcm_f16_act, activation + mr * act_stride, n_rows, k, act_stride, act_threads, k_valid, vtcm_f32_act); - const uint32_t nrows = ne1 * ne2 * ne3; // total n_rows + // Prologue: push A0 and optionally A1 (if n_chunk_cnt > 1) + const size_t n_cols_A0 = hex_smin(n - 0 * n_chunk_n_cols, n_chunk_n_cols); + if (is_quant) { + dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_weight_raw[0], weight), aligned_tile_size, tile_size, tile_size, (n_cols_A0 / 32) * n_k_tiles); + } else { + dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_weight_raw[0], weight), row_stride, weight_stride, row_stride, n_cols_A0); + } - const uint32_t ir_first = nrows_per_thread * ith; // first row - const uint32_t ir_last = MIN(ir_first + nrows_per_thread, nrows); // last row + if (1 < n_chunk_cnt) { + const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols); + if (is_quant) { + dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_weight_raw[1], weight + n_chunk_n_cols * weight_stride), aligned_tile_size, tile_size, tile_size, (n_cols_A1 / 32) * n_k_tiles); + } else { + dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_weight_raw[1], weight + n_chunk_n_cols * weight_stride), row_stride, weight_stride, row_stride, n_cols_A1); + } + } - const size_t src_row_size = ne0 * sizeof(float); - const size_t src_stride = src->nb[1]; + // pop A0 -> dequantize A0 -> submit C0 + dma_queue_pop(ctx->dma[0]); + dequantize_tiled_weight_chunk_to_fp16_tiles( + ctx, vtcm_weight_bufs[0], vtcm_weight_raw[0], + n_cols_A0, k, row_stride, weight_type, + n_k_tiles, n_k_tiles_div, dequant_worker_fn, n_threads); + + hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_f16_act, + (__fp16 *) vtcm_weight_bufs[0], vtcm_scales, + hmx_ceil_div(n_rows, HTP_MM_HMX_TILE_N_ROWS), + hmx_ceil_div(n_cols_A0, HTP_MM_HMX_TILE_N_COLS), k / HTP_MM_HMX_TILE_N_ROWS); + hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[0])); + + // Main loop: pop/dequantize A_{i+1} -> push A_{i+2} -> submit C_{i+1} -> wait C_i and store D_i + for (int i = 0; i < n_chunk_cnt; ++i) { + const size_t nc = i * n_chunk_n_cols; + const size_t nc_p1 = nc + 1 * n_chunk_n_cols; + const size_t nc_p2 = nc + 2 * n_chunk_n_cols; + + const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols); + const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols); + const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols); + + // 1. pop A_{i+1} and dequantize it (if i+1 < n_chunk_cnt) + if (i + 1 < n_chunk_cnt) { + dma_queue_pop(ctx->dma[0]); + dequantize_tiled_weight_chunk_to_fp16_tiles( + ctx, vtcm_weight_bufs[(i + 1) % 2], vtcm_weight_raw[(i + 1) % 2], + n_cols_p1, k, row_stride, weight_type, + n_k_tiles, n_k_tiles_div, dequant_worker_fn, n_threads); + } - uint8_t * restrict src_data = (uint8_t *) src->data + (src_stride * ir_first); - uint8_t * restrict dst_data = (uint8_t *) dst + (dst_stride * ir_first); + // 2. push A_{i+2} (if i+2 < n_chunk_cnt) + if (i + 2 < n_chunk_cnt) { + if (is_quant) { + dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_weight_raw[(i + 2) % 2], weight + nc_p2 * weight_stride), aligned_tile_size, tile_size, tile_size, (n_cols_p2 / 32) * n_k_tiles); + } else { + dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_weight_raw[(i + 2) % 2], weight + nc_p2 * weight_stride), row_stride, weight_stride, row_stride, n_cols_p2); + } + } - for (uint32_t i = ir_first; i < ir_last; ++i) { - hex_l2fetch(src_data, src_row_size, src_stride, 2); - hvx_copy_f16_au(dst_data, src_data, ne0); + // 3. submit C_{i+1} (if i+1 < n_chunk_cnt) + if (i + 1 < n_chunk_cnt) { + hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2], + (__fp16 *) vtcm_f16_act, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], + vtcm_scales, hmx_ceil_div(n_rows, HTP_MM_HMX_TILE_N_ROWS), + hmx_ceil_div(n_cols_p1, HTP_MM_HMX_TILE_N_COLS), k / HTP_MM_HMX_TILE_N_ROWS); + hmx_queue_push(ctx->hmx_queue, hmx_queue_make_desc(hmx_matmul_worker_fn, &job_slots[(i + 1) % 2])); + } - dst_data += dst_stride; - src_data += src_stride; + // 4. wait C_i and store D_i (multi-thread HVX, parallel with C_{i+1}) + hmx_queue_pop(ctx->hmx_queue); + float *output_chunk = dst + (mr * dst_stride + nc); + int chunk_dst_cols = dst_cols - (int)nc; + if (chunk_dst_cols > 0) { + transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, dst_stride, chunk_dst_cols, n_threads); + } + } + } + hmx_queue_suspend(ctx->hmx_queue); + } else { + // --- Synchronous Un-pipelined loop (m <= 32 or fallback) --- + HAP_compute_res_hmx_lock(ctx->vtcm_rctx); + for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) { + const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows); + + transfer_activation_chunk_threaded(ctx, vtcm_f16_act, activation + mr * act_stride, n_rows, k, act_stride, act_threads, k_valid, vtcm_f32_act); + + for (size_t nc = 0; nc < n; nc += n_chunk_n_cols) { + const size_t n_cols = hex_smin(n - nc, n_chunk_n_cols); + const size_t n_row_tiles = hmx_ceil_div(n_rows, HTP_MM_HMX_TILE_N_ROWS); + const size_t n_col_tiles = hmx_ceil_div(n_cols, HTP_MM_HMX_TILE_N_COLS); + + // A: Weight DMA (Synchronous) + if (is_quant) { + dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_weight_raw[0], weight + nc * weight_stride), aligned_tile_size, tile_size, tile_size, (n_cols / 32) * n_k_tiles); + } else { + dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_weight_raw[0], weight + nc * weight_stride), row_stride, weight_stride, row_stride, n_cols); + } + dma_queue_pop(ctx->dma[0]); + + // B: Weight Dequantize (Threaded) + dequantize_tiled_weight_chunk_to_fp16_tiles( + ctx, vtcm_scratch0, vtcm_weight_raw[0], + n_cols, k, row_stride, weight_type, + n_k_tiles, n_k_tiles_div, dequant_worker_fn, n_threads); + + // C: HMX Compute (Synchronous) + core_dot_chunk_fp16(vtcm_output, vtcm_f16_act, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HTP_MM_HMX_TILE_N_ROWS); + + // D: Output Store + float *output_chunk = dst + (mr * dst_stride + nc); + int chunk_dst_cols = dst_cols - (int)nc; + if (chunk_dst_cols > 0) { + transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output, n_rows, n_cols, dst_stride, chunk_dst_cols, n_threads); + } + } + } + HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); } - uint64_t t2 = HAP_perf_get_qtimer_count(); - - FARF(HIGH, "quantize-f16-f16: %u/%u : n-rows %u (%u:%u) row-size %u (%u) -> %u usec %u\n", ith, nth, nrows, ir_first, - ir_last, src_row_size, src_stride, dst_stride, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1)); + return 0; } - -static inline bool htp_is_permuted(const struct htp_tensor * t) { - return t->nb[0] > t->nb[1] || t->nb[1] > t->nb[2] || t->nb[2] > t->nb[3]; +static inline int hmx_mm_batch_r2(const hmx_mm_f16_f32_batched_params_t *params) { + return params->ne02 > 0 ? params->ne12 / params->ne02 : 1; } -static int htp_mminit_vec_dot(struct htp_matmul_context * mmctx, enum htp_data_type type) { - switch (type) { - case HTP_TYPE_Q4_0: - mmctx->type = "q4x4x2-f32"; - mmctx->vec_dot_1x1 = vec_dot_q4x4x2_q8x4x2_1x1; - mmctx->vec_dot_2x1 = vec_dot_q4x4x2_q8x4x2_2x1; - mmctx->vec_dot_2x2 = vec_dot_q4x4x2_q8x4x2_2x2; - mmctx->vec_dot_4x1 = vec_dot_q4x4x2_q8x4x2_4x1; - return 0; - case HTP_TYPE_Q4_1: - mmctx->type = "q4_1x4x2-f32"; - mmctx->vec_dot_1x1 = vec_dot_q4_1x4x2_q8x4x2_1x1; - mmctx->vec_dot_2x1 = vec_dot_q4_1x4x2_q8x4x2_2x1; - mmctx->vec_dot_2x2 = vec_dot_q4_1x4x2_q8x4x2_2x2; - mmctx->vec_dot_4x1 = vec_dot_q4_1x4x2_q8x4x2_4x1; - return 0; - case HTP_TYPE_Q8_0: - mmctx->type = "q8x4x2-f32"; - mmctx->vec_dot_1x1 = vec_dot_q8x4x2_q8x4x2_1x1; - mmctx->vec_dot_2x1 = vec_dot_q8x4x2_q8x4x2_2x1; - mmctx->vec_dot_2x2 = vec_dot_q8x4x2_q8x4x2_2x2; - mmctx->vec_dot_4x1 = vec_dot_q8x4x2_q8x4x2_4x1; - return 0; - case HTP_TYPE_IQ4_NL: - mmctx->type = "iq4nlx4x2-f32"; - mmctx->vec_dot_1x1 = vec_dot_iq4nlx4x2_q8x4x2_1x1; - mmctx->vec_dot_2x1 = vec_dot_iq4nlx4x2_q8x4x2_2x1; - mmctx->vec_dot_2x2 = vec_dot_iq4nlx4x2_q8x4x2_2x2; - mmctx->vec_dot_4x1 = vec_dot_iq4nlx4x2_q8x4x2_4x1; - return 0; - case HTP_TYPE_MXFP4: - mmctx->type = "mxfp4x4x2-f32"; - mmctx->vec_dot_1x1 = vec_dot_mxfp4x4x2_q8x4x2_1x1; - mmctx->vec_dot_2x1 = vec_dot_mxfp4x4x2_q8x4x2_2x1; - mmctx->vec_dot_2x2 = vec_dot_mxfp4x4x2_q8x4x2_2x2; - mmctx->vec_dot_4x1 = vec_dot_mxfp4x4x2_q8x4x2_4x1; - return 0; - default: - return -1; - } +static inline int hmx_mm_batch_r3(const hmx_mm_f16_f32_batched_params_t *params) { + return params->ne03 > 0 ? params->ne13 / params->ne03 : 1; } -static void htp_mminit_spad(struct htp_ops_context * octx, - size_t dst_row_size, - size_t src0_row_size_padded, - size_t src1_row_size, - uint32_t src1_nrows, - size_t src2_spad_size_per_thread) { - octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); - - if (src2_spad_size_per_thread > 0) { - octx->src2_spad.size_per_thread = src2_spad_size_per_thread; - octx->src2_spad.size = octx->src2_spad.size_per_thread; - } +static inline const __fp16 *hmx_mm_weight_batch_ptr(const hmx_mm_f16_f32_batched_params_t *params, + int dst_b2, int dst_b3) { + const int r2 = hmx_mm_batch_r2(params); + const int r3 = hmx_mm_batch_r3(params); + return (const __fp16 *) ((const uint8_t *) params->weight + + (size_t) (dst_b2 / r2) * params->src0_nb2 + + (size_t) (dst_b3 / r3) * params->src0_nb3); +} - // src0 spad is also used in dynamic quantizer to store padded src1 rows - size_t src1_row_size_padded = hex_round_up(src1_row_size, QK_Q8_0x4x2 * sizeof(float)); - if (octx->src0_spad.size_per_thread < src1_row_size_padded) { - octx->src0_spad.size_per_thread = src1_row_size_padded; - } +static inline const float *hmx_mm_activation_batch_ptr(const hmx_mm_f16_f32_batched_params_t *params, + int dst_b2, int dst_b3) { + return (const float *) ((const uint8_t *) params->activation + + (size_t) dst_b2 * params->src1_nb2 + + (size_t) dst_b3 * params->src1_nb3); +} - octx->src1_spad.size = octx->src1_spad.size_per_thread; - octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; - octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads; +static inline float *hmx_mm_dst_batch_ptr(const hmx_mm_f16_f32_batched_params_t *params, + int dst_b2, int dst_b3) { + return (float *) ((uint8_t *) params->dst + + (size_t) dst_b2 * params->dst_nb2 + + (size_t) dst_b3 * params->dst_nb3); } -static int op_matmul_hvx(struct htp_ops_context * octx) { - htp_matmul_tensors_preamble; +static int hmx_mm_f16_f32_batched_simple(struct htp_context *ctx, + const hmx_mm_f16_f32_batched_params_t *params, + int m_chunk, int n_chunk, int pipeline, int n_threads, int act_threads, int vtcm_size) { + int ret = 0; + for (int b3 = 0; b3 < params->ne13 && ret == 0; ++b3) { + for (int b2 = 0; b2 < params->ne12 && ret == 0; ++b2) { + ret = hmx_mm_2d_f32(ctx, hmx_mm_dst_batch_ptr(params, b2, b3), + hmx_mm_activation_batch_ptr(params, b2, b3), + (const uint8_t *)hmx_mm_weight_batch_ptr(params, b2, b3), + params->m, params->k, params->n, + params->act_stride, params->weight_stride * (int)sizeof(__fp16), + HTP_TYPE_F16, params->k, params->n, params->n, + m_chunk, n_chunk, pipeline, n_threads, act_threads, + 0, 0, vtcm_size); + } + } + return ret; +} + +static int hmx_mm_f16_f32_batched(struct htp_context *ctx, const hmx_mm_f16_f32_batched_params_t *params, + int m_chunk, int n_chunk, int pipeline, int n_threads, int act_threads, int vtcm_size) { + if (params->act_stride < params->k || params->weight_stride < params->k || params->dst_stride < params->n) { return -1; } + if (params->ne02 <= 0 || params->ne03 <= 0 || params->ne12 <= 0 || params->ne13 <= 0) { return -1; } + if (params->ne12 % params->ne02 != 0 || params->ne13 % params->ne03 != 0) { return -1; } + if (params->k % 32 != 0 || params->n % 32 != 0) { return -1; } + if (!hex_is_aligned(params->dst, VLEN) || !hex_is_aligned(params->activation, VLEN)) { return -1; } + + const int group_size = hmx_mm_batch_r2(params); + const size_t vtcm_budget = ctx->vtcm_size; + + // Check if the precomputed parameters are grouped or simple. + // If simple, or if group_size <= 1, we use simple fallback loop. + // Grouped path is only valid if group_size > 1 and it fits within VTCM budget. + bool run_grouped = (group_size > 1 && (size_t)vtcm_size <= vtcm_budget); + if (!run_grouped) { + return hmx_mm_f16_f32_batched_simple(ctx, params, m_chunk, n_chunk, pipeline, n_threads, act_threads, vtcm_size); + } + + const size_t vec_dot_size = params->k * sizeof(__fp16); + + const bool use_dma_activation = (params->act_stride > params->k); + const size_t f32_scratch_size = use_dma_activation + ? hex_align_up((size_t)act_threads * HTP_MM_DMA_ACT_MULTIPLIER * (size_t) params->k * sizeof(float), HTP_MM_HMX_TILE_SIZE) : 0; + + size_t m_chunk_n_rows = m_chunk; + size_t n_chunk_n_cols = n_chunk; + size_t vtcm_used = vtcm_size; + + const size_t act_head_stride = m_chunk_n_rows * (size_t) params->k; // fp16 elements between heads + const size_t weight_area_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HTP_MM_HMX_TILE_SIZE); + const size_t activation_area_size = hex_align_up(group_size * m_chunk_n_rows * vec_dot_size, HTP_MM_HMX_TILE_SIZE); + const size_t output_area_size = hex_align_up(m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HTP_MM_HMX_TILE_SIZE); + const size_t scratch_area_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HTP_MM_HMX_TILE_SIZE); + + uint8_t *vtcm_ptr = (uint8_t *) ctx->vtcm_base; + __fp16 *vtcm_weight = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size); + __fp16 *vtcm_f16_act = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, activation_area_size); + __fp16 *vtcm_output = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, output_area_size); + void *vtcm_scratch0 = vtcm_seq_alloc(&vtcm_ptr, scratch_area_size); + void *vtcm_scratch1 = vtcm_seq_alloc(&vtcm_ptr, scratch_area_size); + __fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256); + float *vtcm_f32_act = use_dma_activation ? (float *) vtcm_seq_alloc(&vtcm_ptr, f32_scratch_size) : NULL; + + if ((size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base) > vtcm_budget) { + FARF(HIGH, "%s: grouped layout overflowed VTCM, falling back to simple batched loop", __func__); + return hmx_mm_f16_f32_batched_simple(ctx, params, m_chunk, n_chunk, pipeline, n_threads, act_threads, vtcm_size); + } + + hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); // scale: 1.0, bias: 0.0 in FP16 + + FARF(HIGH, "%s: grouped path m=%d k=%d n=%d group=%d streams=%d mc=%zu nc=%zu vtcm=%zu/%zu", + __func__, params->m, params->k, params->n, group_size, params->ne13, + m_chunk_n_rows, n_chunk_n_cols, + (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget); + + const size_t fp16_row_bytes = (size_t) params->k * sizeof(__fp16); + const size_t weight_row_bytes = (size_t) params->weight_stride * sizeof(__fp16); + + HAP_compute_res_hmx_lock(ctx->vtcm_rctx); + + for (int b3 = 0; b3 < params->ne13; ++b3) { + for (int b2_base = 0; b2_base < params->ne12; b2_base += group_size) { + const __fp16 *weight_group = hmx_mm_weight_batch_ptr(params, b2_base, b3); + + for (size_t mr = 0; mr < (size_t) params->m; mr += m_chunk_n_rows) { + const size_t n_rows = hex_smin((size_t) params->m - mr, m_chunk_n_rows); + const size_t n_row_tiles = hmx_ceil_div((int) n_rows, HTP_MM_HMX_TILE_N_ROWS); + + // Pre-load activations for all heads in the group (once per m_chunk). + // When the source is strided (permuted Q), use 2D DMA to gather + // contiguous rows into a VTCM scratch buffer first, then HVX + // converts from the contiguous VTCM buffer. This avoids L2 cache + // thrashing from HVX loads at large strides. + for (int g = 0; g < group_size; ++g) { + const float *activation_chunk = hmx_mm_activation_batch_ptr(params, b2_base + g, b3) + mr * params->act_stride; + __fp16 *vtcm_act_g = vtcm_f16_act + (size_t) g * act_head_stride; + if (use_dma_activation) { + transfer_activation_chunk_threaded(ctx, vtcm_act_g, + activation_chunk, (int) n_rows, + params->k, params->act_stride, act_threads, params->k, vtcm_f32_act); + } else { + transfer_activation_chunk_threaded(ctx, vtcm_act_g, + activation_chunk, (int) n_rows, + params->k, params->act_stride, act_threads, params->k, NULL); + } + } - struct htp_matmul_context mmctx_struct = {0}; - struct htp_matmul_context * mmctx = &mmctx_struct; - mmctx->octx = octx; + void *buf_curr = vtcm_scratch0; + void *buf_next = vtcm_scratch1; - const uint32_t src0_nrows = ne01 * ne02 * ne03; - const uint32_t src1_nrows = ne11 * ne12 * ne13; + { + const size_t n_cols_first = hex_smin((size_t) params->n, n_chunk_n_cols); + dma_queue_push(ctx->dma[0], dma_make_ptr(buf_curr, weight_group), + fp16_row_bytes, weight_row_bytes, fp16_row_bytes, n_cols_first); + } - // Compute src0_nrows_per_thread - mmctx->src0_nrows_per_thread = (src0_nrows + octx->n_threads - 1) / octx->n_threads; - mmctx->src0_nrows_per_thread += (mmctx->src0_nrows_per_thread & 1); // round up to even + for (size_t nc = 0; nc < (size_t) params->n; nc += n_chunk_n_cols) { + const size_t n_cols = hex_smin((size_t) params->n - nc, n_chunk_n_cols); + const size_t n_col_tiles = hmx_ceil_div((int) n_cols, HTP_MM_HMX_TILE_N_COLS); + + { + dma_queue_pop(ctx->dma[0]); + + const size_t nc_next = nc + n_chunk_n_cols; + if (nc_next < (size_t) params->n) { + const size_t n_cols_next = hex_smin((size_t) params->n - nc_next, n_chunk_n_cols); + const __fp16 *next_weight_chunk = weight_group + nc_next * params->weight_stride; + + dma_queue_push(ctx->dma[0], dma_make_ptr(buf_next, next_weight_chunk), + fp16_row_bytes, weight_row_bytes, fp16_row_bytes, n_cols_next); + } + + hmx_interleave_rows_to_tiles(vtcm_weight, (const __fp16 *) buf_curr, n_cols, params->k, params->k, 0, n_cols); + hex_swap_ptr(&buf_curr, &buf_next); + } + + // Reuse the interleaved weight for every q_head in this GQA group + for (int g = 0; g < group_size; ++g) { + struct htp_thread_trace * tr = &ctx->trace[HTP_MAX_NTHREADS]; + htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, g); + { + const __fp16 * vtcm_act_g = vtcm_f16_act + (size_t) g * act_head_stride; + core_dot_chunk_fp16(vtcm_output, vtcm_act_g, vtcm_weight, vtcm_scales, n_row_tiles, n_col_tiles, + params->k / 32); + } + htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, g); + + { + float *output = hmx_mm_dst_batch_ptr(params, b2_base + g, b3) + mr * params->dst_stride + nc; + int chunk_dst_cols = params->n - (int)nc; + if (chunk_dst_cols > 0) { + transfer_output_chunk_threaded(ctx, output, vtcm_output, (int) n_rows, (int) n_cols, params->dst_stride, chunk_dst_cols, ctx->n_threads); + } + } + } + } + } + } + } - const size_t src0_row_size = nb01; - const size_t dst_row_size = nb1; - size_t src1_row_size = nb11; + HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); - const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); - size_t src1_row_size_padded; + return 0; +} - worker_callback_t quant_job_func; - worker_callback_t matmul_job_func = src1_nrows > 1 ? matmul_2d : matvec_2d; +static void transfer_activation_chunk_gathered_threaded( + struct htp_context *ctx, + __fp16 *dst, + const float *src, + int start_row, + int n_rows, + int k_block, + const struct mmid_row_mapping *matrix_rows, + int cur_a, + int mapping_stride, + int ne11, + size_t nb11, + size_t nb12, + int cne1, + int n_threads, + int k_valid) { + if (n_rows <= 0) return; + int chunks_per_thread = hmx_ceil_div(n_rows, n_threads); + chunks_per_thread = hex_align_up(chunks_per_thread, 2); + + int actual_threads = hmx_ceil_div(n_rows, chunks_per_thread); + + activation_transfer_gathered_task_state_t state = { + .dst = dst, + .src = src, + .n_tasks = actual_threads, + .n_tot_chunks = n_rows, + .n_chunks_per_task = chunks_per_thread, + .k_block = k_block, + .matrix_rows = matrix_rows, + .cur_a = cur_a, + .mapping_stride = mapping_stride, + .ne11 = ne11, + .ne11_div = ne11 > 1 ? init_fastdiv_values(ne11) : (struct fastdiv_values){0, 0}, + .nb11 = nb11, + .nb12 = nb12, + .start_row = start_row, + .cne1 = cne1, + .k_valid = k_valid, + .traces = ctx->trace, + }; + + worker_callback_t worker_fn = ne11 == 1 ? transfer_activation_chunk_gathered_worker_flat_fn : + transfer_activation_chunk_gathered_worker_fn; + + if (actual_threads <= 1) { + worker_fn(1, 0, &state); + } else { + worker_pool_run_func(ctx->worker_pool, worker_fn, &state, actual_threads); + } +} + +static void transfer_output_chunk_scattered_threaded( + struct htp_context *ctx, + float *dst, + const __fp16 *vtcm_src, + int start_row, + int n_rows, + int n_cols, + const struct mmid_row_mapping *matrix_rows, + int cur_a, + int mapping_stride, + size_t dst_nb1, + size_t dst_nb2, + int cne1, + int n_threads) { + if (n_rows <= 0) return; + int chunks_per_thread = hmx_ceil_div(n_rows, n_threads); + chunks_per_thread = hex_align_up(chunks_per_thread, 2); + + int actual_threads = hmx_ceil_div(n_rows, chunks_per_thread); + + output_transfer_scattered_task_state_t state = { + .vtcm_src = vtcm_src, + .dst = dst, + .n_tasks = actual_threads, + .n_tot_chunks = n_rows, + .n_chunks_per_task = chunks_per_thread, + .n_cols = n_cols, + .matrix_rows = matrix_rows, + .cur_a = cur_a, + .mapping_stride = mapping_stride, + .dst_nb1 = dst_nb1, + .dst_nb2 = dst_nb2, + .start_row = start_row, + .cne1 = cne1, + .traces = ctx->trace, + }; + + if (actual_threads <= 1) { + transfer_output_chunk_scattered_worker_fn(1, 0, &state); + } else { + worker_pool_run_func(ctx->worker_pool, transfer_output_chunk_scattered_worker_fn, &state, actual_threads); + } +} + +static int hmx_mm_id_2d_f32(struct htp_context *ctx, + float *restrict dst, + const float *activation, + const uint8_t *weight, + int m, int k, int n, + int k_valid, + int ne11, + size_t act_nb1, size_t act_nb2, + size_t dst_nb1, size_t dst_nb2, + int weight_stride, + int weight_type, + const struct mmid_row_mapping *matrix_rows, + int cur_a, + int mapping_stride) { + const int cne1 = m; + const int m_padded = hex_align_up(m, 32); + + if (k % 32 != 0 || n % 32 != 0) { return -1; } + if (!hex_is_aligned(dst, VLEN) || !hex_is_aligned(activation, VLEN)) { return -1; } + + size_t row_stride = htp_mm_get_tiled_row_stride(weight_type, k); + if (row_stride == 0) { + return -1; + } + + worker_callback_t dequant_worker_fn = NULL; + switch (weight_type) { + case HTP_TYPE_Q4_0: dequant_worker_fn = dequantize_tiled_worker_loop_q4_0; break; + case HTP_TYPE_IQ4_NL: dequant_worker_fn = dequantize_tiled_worker_loop_iq4_nl; break; + case HTP_TYPE_Q4_1: dequant_worker_fn = dequantize_tiled_worker_loop_q4_1; break; + case HTP_TYPE_MXFP4: dequant_worker_fn = dequantize_tiled_worker_loop_mxfp4; break; + case HTP_TYPE_Q8_0: dequant_worker_fn = dequantize_tiled_worker_loop_q8_0; break; + case HTP_TYPE_F16: dequant_worker_fn = convert_f16_worker_loop; break; + case HTP_TYPE_F32: dequant_worker_fn = quantize_f32_worker_loop; break; + default: + return -1; + } - bool need_quant = true; + const int n_k_tiles = k / HTP_MM_HMX_TILE_N_COLS; + const struct fastdiv_values n_k_tiles_div = init_fastdiv_values(n_k_tiles); - if (src0->type == HTP_TYPE_F16) { - // Try optimized f16-f16 path first (src1 in VTCM) - const size_t f16_src1_row_size = hex_round_up(ne10 * 2, 128); - const size_t f16_src1_spad_size = hex_round_up(f16_src1_row_size * src1_nrows, 256); - const size_t f16_src0_spad_size = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256) * octx->n_threads; - const size_t f16_dst_spad_size = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256) * octx->n_threads; - - const size_t f16_total_size = f16_src1_spad_size + f16_src0_spad_size + f16_dst_spad_size; - - // Default matmul implementation does not support multi-batch src0 (N-vs-N broadcasting). - // It only supports 1-vs-N broadcasting (src0 is 2D) or standard 2D matmul. - const bool is_batched = (ne02 > 1) || (ne03 > 1); - const bool is_permuted = htp_is_permuted(octx->src[0]) || htp_is_permuted(octx->src[1]); - - if (!is_batched && !is_permuted && f16_total_size <= octx->ctx->vtcm_size) { - // Optimized path - quant_job_func = (src1->type == HTP_TYPE_F32) ? quantize_f32_f16 : quantize_f16_f16; - mmctx->type = "f16-f16"; - mmctx->vec_dot_1x1 = vec_dot_f16_f16_aa_1x1; - mmctx->vec_dot_2x1 = vec_dot_f16_f16_aa_2x1; - mmctx->vec_dot_2x2 = vec_dot_f16_f16_aa_2x2; - - src1_row_size = f16_src1_row_size; // row size post quantization - - octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); - - octx->src1_spad.size = octx->src1_spad.size_per_thread; - octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; - octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads; - } else { - // Fallback to f16/f32 (DDR) if src1 doesn't fit in VTCM or broadcasting is required - quant_job_func = NULL; - if (src1->type == HTP_TYPE_F32) { - mmctx->type = "f16-f32"; - mmctx->vec_dot_1x1 = vec_dot_f16_f32_uu_1x1; - matmul_job_func = matmul_4d; - } else { - mmctx->type = "f16-f16"; - mmctx->vec_dot_1x1 = vec_dot_f16_f16_uu_1x1; - matmul_job_func = matmul_4d; - } + const int n_threads = ctx->n_threads; + const bool is_quant = (weight_type != HTP_TYPE_F16 && weight_type != HTP_TYPE_F32); - src1_row_size = nb11; // original row size in DDR + const size_t vec_dot_size = k * sizeof(__fp16); + const size_t vtcm_budget = ctx->vtcm_size; + size_t vtcm_used = 0; - octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size, 256); - octx->src1_spad.size_per_thread = hex_round_up(MM_SPAD_SRC1_NROWS * src1_row_size, 256); + int tile_size = htp_mm_get_weight_tile_size(weight_type); + int aligned_tile_size = htp_mm_get_weight_aligned_tile_size(weight_type); - octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; - octx->src1_spad.size = octx->src1_spad.size_per_thread * octx->n_threads; - octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads; + const size_t qweight_row_stride = is_quant ? (size_t)(n_k_tiles * aligned_tile_size) / 32 : 0; + const size_t weight_row_stride = is_quant ? qweight_row_stride : row_stride; - // Init fastdiv for matmul_4d (supports broadcasting) - mmctx->mm_div_ne12_ne1 = init_fastdiv_values(src1->ne[2] * dst->ne[1]); - mmctx->mm_div_ne1 = init_fastdiv_values(dst->ne[1]); - mmctx->mm_div_r2 = init_fastdiv_values(src1->ne[2] / src0->ne[2]); - mmctx->mm_div_r3 = init_fastdiv_values(src1->ne[3] / src0->ne[3]); + size_t size_per_n = 0, size_per_m = 0, size_per_mn = 0; + htp_mm_hmx_get_2d_chunk_costs(weight_type, k, /*pipeline=*/false, aligned_tile_size, + &size_per_n, &size_per_m, &size_per_mn); - need_quant = false; - } - } else if (src0->type == HTP_TYPE_F32) { - // Try optimized f32-f32 path first (src1 in VTCM) - const size_t f32_src1_row_size = hex_round_up(ne10 * 4, 128); - const size_t f32_src1_spad_size = hex_round_up(f32_src1_row_size * src1_nrows, 256); - const size_t f32_src0_spad_size = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256) * octx->n_threads; - const size_t f32_dst_spad_size = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256) * octx->n_threads; - - const size_t f32_total_size = f32_src1_spad_size + f32_src0_spad_size + f32_dst_spad_size; - - const bool is_batched = (ne02 > 1) || (ne03 > 1); - const bool is_permuted = htp_is_permuted(octx->src[0]) || htp_is_permuted(octx->src[1]); - - if (!is_batched && !is_permuted && f32_total_size <= octx->ctx->vtcm_size) { - // Optimized path - quant_job_func = quantize_f32_f32; - mmctx->type = "f32-f32"; - mmctx->vec_dot_1x1 = vec_dot_f32_f32_aa_1x1; - mmctx->vec_dot_2x1 = vec_dot_f32_f32_aa_2x1; - mmctx->vec_dot_2x2 = vec_dot_f32_f32_aa_2x2; - - src1_row_size = f32_src1_row_size; - - octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size_padded, 256); - octx->src1_spad.size_per_thread = hex_round_up(src1_row_size * src1_nrows, 256); - - octx->src1_spad.size = octx->src1_spad.size_per_thread; - octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; - octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads; - } else { - // Fallback to DDR / broadcasting - quant_job_func = NULL; - mmctx->type = "f32-f32"; - mmctx->vec_dot_1x1 = vec_dot_f32_f32_uu_1x1; - matmul_job_func = matmul_4d; + size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0; + if (htp_mm_hmx_compute_chunks(vtcm_budget, /*overhead=*/256, size_per_n, size_per_m, size_per_mn, + m_padded, n, + /*m_block_cost=*/(size_t) n * HTP_MM_HMX_COST_W_DEQUANT, + /*n_block_cost=*/(size_t) m_padded * HTP_MM_HMX_COST_A_CONVERT, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used)) { + FARF(ERROR, "hmx-mm-id-2d: VTCM too small : m %d k %d n %d budget %zu", m_padded, k, n, vtcm_budget); + return -1; + } - src1_row_size = nb11; + const size_t weight_area_size = hex_align_up(n_chunk_n_cols * weight_row_stride, HTP_MM_HMX_TILE_SIZE); + const size_t act_area_size = hex_align_up(m_chunk_n_rows * vec_dot_size, HTP_MM_HMX_TILE_SIZE); + const size_t output_area_size = hex_align_up(m_chunk_n_rows * n_chunk_n_cols * sizeof(__fp16), HTP_MM_HMX_TILE_SIZE); - octx->dst_spad.size_per_thread = hex_round_up(MM_SPAD_DST_NROWS * dst_row_size, 256); - octx->src0_spad.size_per_thread = hex_round_up(MM_SPAD_SRC0_NROWS * src0_row_size, 256); - octx->src1_spad.size_per_thread = hex_round_up(MM_SPAD_SRC1_NROWS * src1_row_size, 256); + size_t scratch0_size = hex_align_up(n_chunk_n_cols * vec_dot_size, HTP_MM_HMX_TILE_SIZE); - octx->src0_spad.size = octx->src0_spad.size_per_thread * octx->n_threads; - octx->src1_spad.size = octx->src1_spad.size_per_thread * octx->n_threads; - octx->dst_spad.size = octx->dst_spad.size_per_thread * octx->n_threads; + uint8_t *vtcm_ptr = (uint8_t *) ctx->vtcm_base; + __fp16 *vtcm_weight = weight_area_size ? (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, weight_area_size) : NULL; + __fp16 *vtcm_f16_act = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, act_area_size); + __fp16 *vtcm_output = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, output_area_size); + void *vtcm_scratch0 = vtcm_seq_alloc(&vtcm_ptr, scratch0_size); + __fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256); - // Init fastdiv for matmul_4d (supports broadcasting) - mmctx->mm_div_ne12_ne1 = init_fastdiv_values(src1->ne[2] * dst->ne[1]); - mmctx->mm_div_ne1 = init_fastdiv_values(dst->ne[1]); - mmctx->mm_div_r2 = init_fastdiv_values(src1->ne[2] / src0->ne[2]); - mmctx->mm_div_r3 = init_fastdiv_values(src1->ne[3] / src0->ne[3]); + vtcm_used = vtcm_ptr - (uint8_t *) ctx->vtcm_base; + if (vtcm_used > vtcm_budget) { + FARF(ERROR, "hmx-mm-id-2d: VTCM overflow: used %zu budget %zu", vtcm_used, vtcm_budget); + return -1; + } - need_quant = false; - } - } else { - if (htp_mminit_vec_dot(mmctx, src0->type) != 0) { - return HTP_STATUS_NO_SUPPORT; - } + hmx_init_column_scales(vtcm_scales, Q6_V_vsplat_R(0x3c00)); - if (src0->type == HTP_TYPE_Q4_1) { - quant_job_func = quantize_f32_q8_1x4x2; - src1_row_size = q8_1x4x2_row_size(ne10); - } else { - quant_job_func = quantize_f32_q8x4x2; - src1_row_size = q8x4x2_row_size(ne10); - } - htp_mminit_spad(octx, dst_row_size, src0_row_size_padded, src1_row_size, src1_nrows, 0); - } + HAP_compute_res_hmx_lock(ctx->vtcm_rctx); - // VTCM scratchpads for all tensors - size_t spad_size = octx->src1_spad.size + octx->src0_spad.size + octx->dst_spad.size; + for (size_t mr = 0; mr < (size_t) m_padded; mr += m_chunk_n_rows) { + const size_t n_rows = hex_smin(m_padded - mr, m_chunk_n_rows); + const size_t n_row_tiles = hmx_ceil_div(n_rows, HTP_MM_HMX_TILE_N_ROWS); - FARF(HIGH, "matmul-%s : src0-spad-size %u src1-spad-size %u dst-spad-size %u (%zu)\n", mmctx->type, - octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size, spad_size); + transfer_activation_chunk_gathered_threaded( + ctx, vtcm_f16_act, activation, (int) mr, (int) n_rows, k, + matrix_rows, cur_a, mapping_stride, ne11, act_nb1, act_nb2, cne1, n_threads, k_valid); - FARF(HIGH, "matmul-%s : %ux%ux%ux%u * %ux%ux%ux%u-> %ux%ux%ux%u (0x%p, 0x%p, 0x%p)\n", mmctx->type, src0->ne[0], - src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], dst->ne[0], - dst->ne[1], dst->ne[2], dst->ne[3], src0->data, src1->data, dst->data); + for (size_t nc = 0; nc < (size_t) n; nc += n_chunk_n_cols) { + const size_t n_cols = hex_smin((size_t) n - nc, n_chunk_n_cols); + const size_t n_col_tiles = hmx_ceil_div(n_cols, HTP_MM_HMX_TILE_N_COLS); - // Make sure the reserved vtcm size is sufficient - if (octx->ctx->vtcm_size < spad_size) { - FARF(ERROR, "matmul-%s : current VTCM reservation %zu is too small, needed %zu\n", mmctx->type, - octx->ctx->vtcm_size, spad_size); - return HTP_STATUS_VTCM_TOO_SMALL; + if (is_quant) { + dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_weight, weight + nc * weight_stride), aligned_tile_size, tile_size, tile_size, (n_cols / 32) * n_k_tiles); + } else { + dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_weight, weight + nc * weight_stride), row_stride, weight_stride, row_stride, n_cols); + } + dma_queue_pop(ctx->dma[0]); + + dequantize_tiled_weight_chunk_to_fp16_tiles( + ctx, vtcm_scratch0, vtcm_weight, + n_cols, k, row_stride, weight_type, + n_k_tiles, n_k_tiles_div, dequant_worker_fn, n_threads + ); + + struct htp_thread_trace * tr = &ctx->trace[HTP_MAX_NTHREADS]; + htp_trace_event_start(tr, HTP_TRACE_EVT_HMX_COMP, nc); + core_dot_chunk_fp16(vtcm_output, vtcm_f16_act, vtcm_scratch0, vtcm_scales, n_row_tiles, n_col_tiles, k / HTP_MM_HMX_TILE_N_ROWS); + htp_trace_event_stop(tr, HTP_TRACE_EVT_HMX_COMP, nc); + + transfer_output_chunk_scattered_threaded( + ctx, dst + nc, vtcm_output, (int) mr, (int) n_rows, (int) n_cols, + matrix_rows, cur_a, mapping_stride, dst_nb1, dst_nb2, cne1, n_threads); + } } - // Place src1 spad first. We use it for dyn.quant and may reuse between ops - octx->src1_spad.data = octx->ctx->vtcm_base; - octx->src0_spad.data = octx->src1_spad.data + octx->src1_spad.size; - octx->dst_spad.data = octx->src0_spad.data + octx->src0_spad.size; + HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); + return 0; +} - octx->src1_spad.src = (src1 == octx->src1_spad.src) ? src1 : NULL; - octx->src0_spad.src = NULL; - octx->dst_spad.src = NULL; - octx->src0_spad.stride = src0_row_size_padded; - octx->src1_spad.stride = src1_row_size; +// --- Dispatchers and Public Entry Points --- - if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) - return HTP_STATUS_OK; +static int hmx_mm_op_matmul(struct htp_ops_context * octx, const struct htp_mm_kernel_params * kparams) { + htp_matmul_tensors_preamble; - if (need_quant && !octx->src1_spad.src) { - const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads); - mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs; - worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs); - octx->src1_spad.src = src1; + int k = (int) src0->ne[0]; + int n = (int) src0->ne[1]; + const int m_total = (int) src1->ne[1]; + const int act_stride = (int)(src1->nb[1] / sizeof(float)); + const int wgt_stride = (int)(src0->nb[1] / sizeof(__fp16)); + + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { + return HTP_STATUS_OK; } - const uint32_t n_matmul_jobs = octx->n_threads; - worker_pool_run_func(octx->ctx->worker_pool, matmul_job_func, mmctx, n_matmul_jobs); + int ret = -1; + const int n_threads = MIN(kparams->n_threads, (int) octx->n_threads); + if (kparams->kernel_type == HTP_MM_KERNEL_HMX_F16_BATCHED) { + hmx_mm_f16_f32_batched_params_t batch_params = { + .dst = (float *) dst->data, + .activation = (float *) src1->data, + .weight = (const __fp16 *) src0->data, + .m = m_total, + .k = k, + .n = n, + .act_stride = act_stride, + .weight_stride = wgt_stride, + .dst_stride = (int) (dst->nb[1] / sizeof(float)), + .ne02 = ne02, + .ne03 = ne03, + .ne12 = ne12, + .ne13 = ne13, + .src0_nb2 = src0->nb[2], + .src0_nb3 = src0->nb[3], + .src1_nb2 = src1->nb[2], + .src1_nb3 = src1->nb[3], + .dst_nb2 = dst->nb[2], + .dst_nb3 = dst->nb[3], + }; + ret = hmx_mm_f16_f32_batched(octx->ctx, &batch_params, + kparams->m_chunk, kparams->n_chunk, + kparams->pipeline, n_threads, + kparams->n_act_threads, + kparams->vtcm_size); + } else { + ret = hmx_mm_2d_f32( + octx->ctx, (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data, + m_total, k, n, act_stride, (int) src0->nb[1], (int) src0->type, (int) src1->ne[0], + (int)(dst->nb[1] / sizeof(float)), (int)dst->ne[0], + kparams->m_chunk, kparams->n_chunk, kparams->pipeline, n_threads, + kparams->n_act_threads, + kparams->tile_size, kparams->aligned_tile_size, kparams->vtcm_size + ); + } + if (ret != 0) { + FARF(ERROR, "HMX matmul failed (ret=%d)\n", ret); + return HTP_STATUS_INTERNAL_ERR; + } return HTP_STATUS_OK; } int op_matmul(struct htp_ops_context * octx) { - htp_matmul_tensors_preamble; - -#ifndef HTP_HAS_HMX - return op_matmul_hvx(octx); -#else - if (!octx->ctx->hmx_enabled) { - return op_matmul_hvx(octx); - } + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; - // HMX weight tile requires N to be 32-aligned. - if (src0->ne[1] % 32 != 0) { - return op_matmul_hvx(octx); + if (kparams->n_hmx) { + return hmx_mm_op_matmul(octx, kparams); } - // HMX supports F16, F32, Q4_0, Q8_0, IQ4_NL, MXFP4 weights. - // Other types fall back to HVX. - uint32_t wtype = src0->type; - if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32 && wtype != HTP_TYPE_Q4_0 && wtype != HTP_TYPE_Q4_1 && wtype != HTP_TYPE_Q8_0 && wtype != HTP_TYPE_IQ4_NL && wtype != HTP_TYPE_MXFP4) { - return op_matmul_hvx(octx); - } + return hvx_mm_matmul(octx); +} - // Quantised HMX path requires K aligned to 256 (x4x2 super-block). - // F16 and F32 HMX paths require K aligned to 32 (tile width). - if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32 && src0->ne[0] % 256 != 0) { - return op_matmul_hvx(octx); - } +static int hmx_mm_op_matmul_id( + struct htp_ops_context * octx, + struct htp_mm_context * mmctx, + const uint32_t * matrix_row_counts, + const struct mmid_row_mapping * matrix_rows, + void * mapping_buf, + bool must_free_mapping +) { + htp_matmul_tensors_preamble; + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; + const int n_ids = octx->src[2]->ne[0]; + const int n_as = ne02; - if ((wtype == HTP_TYPE_F16 || wtype == HTP_TYPE_F32) && src0->ne[0] % 32 != 0) { - return op_matmul_hvx(octx); + for (uint32_t cur_a = 0; cur_a < n_as; ++cur_a) { + const int32_t cne1 = matrix_row_counts[cur_a]; + if (cne1 == 0) continue; + + int ret = hmx_mm_id_2d_f32(octx->ctx, (float*) dst->data, (float*) src1->data, + (const uint8_t *) src0->data + cur_a * nb02, + cne1, ne00, ne01, + ne10, + ne11, + nb11, nb12, + nb1, nb2, + (int) src0->nb[1], (int) src0->type, + matrix_rows, cur_a, n_ids * octx->src[2]->ne[1]); + if (ret != 0) { + FARF(ERROR, "HMX matmul failed for expert %u, error %d\n", cur_a, ret); + if (must_free_mapping) free(mapping_buf); + return HTP_STATUS_NO_SUPPORT; + } } - const bool is_batched = (src0->ne[2] * src0->ne[3] > 1 || src1->ne[2] * src1->ne[3] > 1); + if (must_free_mapping) free(mapping_buf); + return HTP_STATUS_OK; +} - // Quantised HMX kernels only handle flat 2D matmul (host already rejects - // batched quantised, but guard here too). F16 batched matmul is handled - // by the dedicated wrapper in hmx-matmul-ops.c. - if (is_batched && src0->type != HTP_TYPE_F16) { - return op_matmul_hvx(octx); - } +static int hvx_mm_op_matmul_id( + struct htp_ops_context * octx, + struct htp_mm_context * mmctx, + size_t src0_row_size_padded, + uint32_t src1_nrows, + worker_callback_t matmul_id_job_func, + void * mapping_buf, + bool must_free_mapping +) { + htp_matmul_tensors_preamble; + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; + const struct htp_tensor * restrict ids = octx->src[2]; + const size_t src0_row_size = nb01; - // HMX assumes contiguous row-major layout. Fall back for permuted - // tensors where strides are non-monotonic (e.g. transposed KV cache). - if (src0->nb[0] > src0->nb[1] || src1->nb[0] > src1->nb[1]) { - return op_matmul_hvx(octx); - } + const uint32_t qk = QK_Q8_0_TILED; + const uint32_t nb = (ne10 + qk - 1) / qk; + const uint32_t total_nb = src1_nrows * nb; - // M alignment: Use HMX when M >= 32, the last partial tile (m_total % 32 rows) - // is handled by HMX itself; when M < 32 fall back to HVX. - const int m_total = (int) src1->ne[1]; - const int m_hmx = m_total & ~31; // 0 when M < 32 - if (m_hmx == 0) { - return op_matmul_hvx(octx); + worker_callback_t quant_job_func; + uint32_t n_quant_jobs = 1; + if (src1_nrows < octx->n_threads) { + n_quant_jobs = MIN(total_nb, octx->n_threads); + quant_job_func = (src0->type == HTP_TYPE_Q4_1) ? quantize_f32_q8_1_tiled_block : quantize_f32_q8_0_tiled_block; + for (uint32_t ith = 0; ith < n_quant_jobs; ++ith) { + uint32_t ib_first = (total_nb * ith) / n_quant_jobs; + uint32_t ib_last = (total_nb * (ith + 1)) / n_quant_jobs; + mmctx->quant_ib_first[ith] = ib_first; + mmctx->quant_ib_last[ith] = ib_last; + mmctx->quant_r[ith] = ib_first / nb; + mmctx->quant_c[ith] = ib_first % nb; + } + } else { + n_quant_jobs = MIN(src1_nrows, octx->n_threads); + quant_job_func = (src0->type == HTP_TYPE_Q4_1) ? quantize_f32_q8_1_tiled : quantize_f32_q8_0_tiled; } + size_t src1_row_size = (src0->type == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10) : htp_mm_q8_0_tiled_row_size(ne10); - // Always re-quantize src1 since HMX kernel overwrites vtcm/spad, - // so any previously cached quantized data is invalid. - octx->src1_spad.src = NULL; + // Scratchpad sizes are computed on the host (htp_mm_hvx_id_get_vtcm_sizes) and passed in. + // The ID layout is routing-independent, so the host has exact visibility -- consume it here + // rather than recomputing, to keep host budgeting and device allocation in lockstep. + size_t src0_sz = kparams->vtcm_src0_size; + size_t src1_sz = kparams->vtcm_src1_size; + size_t src2_sz = 0; // mapping lives in DDR + size_t dst_sz = 0; // ID kernels scatter straight to DDR + size_t vtcm_size = kparams->vtcm_size; - int k = (int) src0->ne[0]; // inner dimension - int n = (int) src0->ne[1]; // weight columns + size_t src0_sz_per_thread = src0_sz / octx->n_threads; + size_t src1_sz_per_thread = src1_sz; + size_t src2_sz_per_thread = 0; + size_t dst_sz_per_thread = 0; - int ret = -1; + FARF(HIGH, "matmul-id-%s : src0-spad-size %zu src1-spad-size %zu src2-spad-size %zu dst-spad-size %zu (%zu)\n", mmctx->type, + src0_sz, src1_sz, src2_sz, dst_sz, vtcm_size); - // Row strides in elements. For compact tensors these equal k; for - // permuted attention views they can be larger, so pass the real stride. - const int act_stride = (int)(src1->nb[1] / sizeof(float)); - const int wgt_stride = (int)(src0->nb[1] / sizeof(__fp16)); + FARF(HIGH, "matmul-id-%s : %ux%ux%ux%u * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u (0x%p, 0x%p, 0x%p)\n", mmctx->type, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], src0->data, + src1->data, dst->data); - if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { - return HTP_STATUS_OK; + // Make sure the reserved vtcm size is sufficient + if (octx->ctx->vtcm_size < vtcm_size) { + FARF(ERROR, "matmul-id-%s : current VTCM reservation %zu is too small, needed %zu\n", mmctx->type, octx->ctx->vtcm_size, vtcm_size); + if (must_free_mapping) free(mapping_buf); + return HTP_STATUS_VTCM_TOO_SMALL; } - if (is_batched) { - if (src0->type == HTP_TYPE_F16) { - hmx_matmul_f16_f32_batched_params_t batch_params = { - .dst = (float *) dst->data, - .activation = (float *) src1->data, - .permuted_weight = (const __fp16 *) src0->data, - .m = m_total, - .k = k, - .n = n, - .act_stride = act_stride, - .weight_stride = wgt_stride, - .dst_stride = (int) (dst->nb[1] / sizeof(float)), - .ne02 = ne02, - .ne03 = ne03, - .ne12 = ne12, - .ne13 = ne13, - .src0_nb2 = src0->nb[2], - .src0_nb3 = src0->nb[3], - .src1_nb2 = src1->nb[2], - .src1_nb3 = src1->nb[3], - .dst_nb2 = dst->nb[2], - .dst_nb3 = dst->nb[3], - }; - ret = hmx_matmul_f16_f32_batched(octx->ctx, &batch_params); - } else { - return op_matmul_hvx(octx); - } - } else { - ret = hmx_matmul_2d_f32(octx->ctx, (float*) dst->data, (float*) src1->data, (const uint8_t *) src0->data, - m_total, k, n, act_stride, (int) src0->nb[1], (int) src0->type); - } + uint8_t * vtcm_ptr = (uint8_t *) octx->ctx->vtcm_base; + mmctx->vtcm_src1 = vtcm_seq_alloc(&vtcm_ptr, src1_sz); + mmctx->vtcm_src0 = vtcm_seq_alloc(&vtcm_ptr, src0_sz); + mmctx->vtcm_src2 = vtcm_seq_alloc(&vtcm_ptr, src2_sz); + mmctx->vtcm_dst = vtcm_seq_alloc(&vtcm_ptr, dst_sz); - if (ret != 0) { - FARF(HIGH, "HMX matmul failed (ret=%d), falling back to HVX", ret); - return op_matmul(octx); - } + octx->src1_spad.src = NULL; + octx->src0_spad.src = NULL; + octx->src2_spad.src = NULL; + octx->dst_spad.src = NULL; - return 0; -#endif // HTP_HAS_HMX + mmctx->vtcm_src0_stride = src0_row_size_padded; + mmctx->vtcm_src1_stride = src1_row_size; + + mmctx->vtcm_src0_size_per_thread = src0_sz_per_thread; + mmctx->vtcm_src1_size_per_thread = src1_sz_per_thread; + mmctx->vtcm_src2_size_per_thread = src2_sz_per_thread; + mmctx->vtcm_dst_size_per_thread = dst_sz_per_thread; + + mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs; + worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs); + + const uint32_t n_matmul_jobs = octx->n_threads; + worker_pool_run_func(octx->ctx->worker_pool, matmul_id_job_func, mmctx, n_matmul_jobs); + + if (must_free_mapping) free(mapping_buf); + return HTP_STATUS_OK; } int op_matmul_id(struct htp_ops_context * octx) { htp_matmul_tensors_preamble; - struct htp_matmul_context mmctx_struct = {0}; - struct htp_matmul_context * mmctx = &mmctx_struct; + struct htp_mm_context mmctx_struct = {0}; + struct htp_mm_context * mmctx = &mmctx_struct; mmctx->octx = octx; + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; + const struct htp_tensor * restrict ids = octx->src[2]; const size_t src0_row_size = nb01; @@ -4793,14 +3260,11 @@ int op_matmul_id(struct htp_ops_context * octx) { const uint32_t src1_nrows = ne11 * ne12 * ne13; worker_callback_t quant_job_func; - worker_callback_t matmul_id_job_func = src1_nrows > 1 ? matmul_id : matvec_id; + worker_callback_t matmul_id_job_func = src1_nrows > 1 ? hvx_mm_id : hvx_mv_id; // Compute src0_nrows_per_thread mmctx->src0_nrows_per_thread = (src0_nrows + octx->n_threads - 1) / octx->n_threads; - mmctx->src0_nrows_per_thread += (mmctx->src0_nrows_per_thread & 1); // round up to even - - size_t src1_row_size; - size_t src1_row_size_padded; + mmctx->src0_nrows_per_thread = hex_round_up(mmctx->src0_nrows_per_thread, 32); // row groups const int n_ids = ids->ne[0]; // n_expert_used @@ -4829,130 +3293,324 @@ int op_matmul_id(struct htp_ops_context * octx) { mmctx->matrix_row_counts = matrix_row_counts; mmctx->matrix_rows = matrix_rows; + mmctx->mm_div_ne11 = kparams->div_ne11; - if (htp_mminit_vec_dot(mmctx, src0->type) != 0) { + if (hvx_mm_init_vec_dot(mmctx, src0->type) != 0) { if (must_free_mapping) free(mapping_buf); return HTP_STATUS_NO_SUPPORT; } - if (src0->type == HTP_TYPE_Q4_1) { - quant_job_func = quantize_f32_q8_1x4x2; - src1_row_size = q8_1x4x2_row_size(ne10); + if (src1_nrows > 1) { + // initialize matrix_row_counts and map + memset(matrix_row_counts, 0, n_as * sizeof(uint32_t)); + + // group rows by src0 matrix + for (uint32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { // token idx + for (uint32_t id = 0; id < n_ids; ++id) { // expert idx + const int32_t i02 = *(const int32_t *) ((const uint8_t *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]); + + if (i02 < 0) { + continue; + } + assert(i02 < n_as); + + matrix_rows[i02 * n_ids * ids->ne[1] + matrix_row_counts[i02]] = (struct mmid_row_mapping) { id, iid1 }; + matrix_row_counts[i02] += 1; + } + } + } + + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { + if (must_free_mapping) free(mapping_buf); + return HTP_STATUS_OK; + } + + if (kparams->n_hmx) { + return hmx_mm_op_matmul_id(octx, mmctx, matrix_row_counts, matrix_rows, mapping_buf, must_free_mapping); + } + + return hvx_mm_op_matmul_id(octx, mmctx, src0_row_size_padded, src1_nrows, matmul_id_job_func, mapping_buf, must_free_mapping); +} + +int op_matmul_qkv(struct htp_ops_context * octx) { + const struct htp_tensor * restrict src0 = octx->src[0]; // Wk + const struct htp_tensor * restrict src1 = octx->src[1]; // x + const struct htp_tensor * restrict src2 = octx->src[2]; // Wv + const struct htp_tensor * restrict src3 = octx->src[3]; // Wq + const struct htp_tensor * restrict dst_k = octx->dsts[0]; + const struct htp_tensor * restrict dst_v = octx->dsts[1]; + const struct htp_tensor * restrict dst_q = octx->dsts[2]; + + bool is_repacked = (src0->type == HTP_TYPE_Q4_0 || src0->type == HTP_TYPE_Q4_1 || + src0->type == HTP_TYPE_Q8_0 || src0->type == HTP_TYPE_IQ4_NL || + src0->type == HTP_TYPE_MXFP4); + + struct htp_mm_context mmctx_struct = {0}; + struct htp_mm_context * mmctx = &mmctx_struct; + mmctx->octx = octx; + + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; + + const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; + const uint32_t src1_nrows = src1->ne[1] * src1->ne[2] * src1->ne[3]; + + // Compute src0_nrows_per_thread + mmctx->src0_nrows_per_thread = (src0_nrows + octx->n_threads - 1) / octx->n_threads; + if (is_repacked) { + mmctx->src0_nrows_per_thread = hex_round_up(mmctx->src0_nrows_per_thread, 32); } else { - quant_job_func = quantize_f32_q8x4x2; - src1_row_size = q8x4x2_row_size(ne10); + mmctx->src0_nrows_per_thread += (mmctx->src0_nrows_per_thread & 1); // round up to even } - const size_t src2_spad_size_per_thread = 0; // We moved the mapping to DDR! - htp_mminit_spad(octx, dst_row_size, src0_row_size_padded, src1_row_size, src1_nrows, src2_spad_size_per_thread); + const size_t src0_row_size = src0->nb[1]; + const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); - size_t spad_size = octx->src2_spad.size + octx->src1_spad.size + octx->src0_spad.size + octx->dst_spad.size; + if (hvx_mm_init_vec_dot(mmctx, src0->type) != 0) { + return HTP_STATUS_NO_SUPPORT; + } - FARF(HIGH, "matmul-id-%s : src0-spad-size %u src1-spad-size %u src2-spad-size %u dst-spad-size %u (%zu)\n", mmctx->type, - octx->src0_spad.size, octx->src1_spad.size, octx->src2_spad.size, octx->dst_spad.size, spad_size); + const uint32_t qk = QK_Q8_0_TILED; + const uint32_t nb = (src1->ne[0] + qk - 1) / qk; + const uint32_t total_nb = src1_nrows * nb; - FARF(HIGH, "matmul-id-%s : %ux%ux%ux%u * %ux%ux%ux%u (%ux%ux%ux%u) -> %ux%ux%ux%u (0x%p, 0x%p, 0x%p)\n", mmctx->type, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], - ids->ne[0], ids->ne[1], ids->ne[2], ids->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], src0->data, - src1->data, dst->data); + worker_callback_t quant_job_func; + uint32_t n_quant_jobs = 1; + if (kparams->kernel_type == HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT) { + n_quant_jobs = MIN(src1_nrows, octx->n_threads); + quant_job_func = (src0->type == HTP_TYPE_Q4_1) ? quantize_f32_q8_1_flat : quantize_f32_q8_0_flat; + } else if (src1_nrows < octx->n_threads) { + n_quant_jobs = MIN(total_nb, octx->n_threads); + quant_job_func = (src0->type == HTP_TYPE_Q4_1) ? quantize_f32_q8_1_tiled_block : quantize_f32_q8_0_tiled_block; + for (uint32_t ith = 0; ith < n_quant_jobs; ++ith) { + uint32_t ib_first = (total_nb * ith) / n_quant_jobs; + uint32_t ib_last = (total_nb * (ith + 1)) / n_quant_jobs; + mmctx->quant_ib_first[ith] = ib_first; + mmctx->quant_ib_last[ith] = ib_last; + mmctx->quant_r[ith] = ib_first / nb; + mmctx->quant_c[ith] = ib_first % nb; + } + } else { + n_quant_jobs = MIN(src1_nrows, octx->n_threads); + quant_job_func = (src0->type == HTP_TYPE_Q4_1) ? quantize_f32_q8_1_tiled : quantize_f32_q8_0_tiled; + } - // Make sure the reserved vtcm size is sufficient - if (octx->ctx->vtcm_size < spad_size) { - FARF(ERROR, "matmul-id-%s : current VTCM reservation %zu is too small, needed %zu\n", mmctx->type, octx->ctx->vtcm_size, spad_size); - if (must_free_mapping) free(mapping_buf); + size_t src1_row_size; + if (kparams->kernel_type == HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT) { + src1_row_size = (src0->type == HTP_TYPE_Q4_1) ? htp_mm_q8_1_flat_row_size(src1->ne[0]) : htp_mm_q8_0_flat_row_size(src1->ne[0]); + } else { + src1_row_size = (src0->type == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(src1->ne[0]) : htp_mm_q8_0_tiled_row_size(src1->ne[0]); + } + + // Set up scratchpads using precomputed sizes from the host + size_t src0_sz = kparams->vtcm_src0_size; + size_t src1_sz = kparams->vtcm_src1_size; + size_t src2_sz = kparams->vtcm_src2_size; + size_t src3_sz = kparams->vtcm_src3_size; + size_t vtcm_size = kparams->vtcm_size; + + size_t src0_sz_per_thread = src0_sz / octx->n_threads; + size_t src1_sz_per_thread = src1_sz; + size_t src2_sz_per_thread = src2_sz / octx->n_threads; + size_t src3_sz_per_thread = src3_sz / octx->n_threads; + + if (octx->ctx->vtcm_size < vtcm_size) { + FARF(ERROR, "matmul-qkv: current VTCM reservation %zu is too small, needed %zu\n", + octx->ctx->vtcm_size, vtcm_size); return HTP_STATUS_VTCM_TOO_SMALL; } - // Place src1 spad first. We use it for dyn.quant and may reuse in subseq ops. - octx->src1_spad.data = octx->ctx->vtcm_base; - octx->src0_spad.data = octx->src1_spad.data + octx->src1_spad.size; - octx->src2_spad.data = octx->src0_spad.data + octx->src0_spad.size; - octx->dst_spad.data = octx->src2_spad.data + octx->src2_spad.size; + uint8_t * vtcm_ptr = (uint8_t *) octx->ctx->vtcm_base; + mmctx->vtcm_src1 = vtcm_seq_alloc(&vtcm_ptr, src1_sz); + mmctx->vtcm_src0 = vtcm_seq_alloc(&vtcm_ptr, src0_sz); + mmctx->vtcm_src2 = vtcm_seq_alloc(&vtcm_ptr, src2_sz); + mmctx->vtcm_src3 = vtcm_seq_alloc(&vtcm_ptr, src3_sz); - octx->src1_spad.src = (src1 == octx->src1_spad.src) ? src1 : NULL; + octx->src1_spad.src = NULL; octx->src0_spad.src = NULL; octx->src2_spad.src = NULL; - octx->dst_spad.src = NULL; + octx->src3_spad.src = NULL; - octx->src0_spad.stride = src0_row_size_padded; - octx->src1_spad.stride = src1_row_size; + mmctx->vtcm_src0_stride = is_repacked ? 0 : src0_row_size_padded; + mmctx->vtcm_src2_stride = is_repacked ? 0 : src0_row_size_padded; + mmctx->vtcm_src3_stride = is_repacked ? 0 : src0_row_size_padded; + mmctx->vtcm_src1_stride = src1_row_size; - if (src1_nrows > 1) { - // initialize matrix_row_counts and map - memset(matrix_row_counts, 0, n_as * sizeof(uint32_t)); + mmctx->vtcm_src0_size_per_thread = src0_sz_per_thread; + mmctx->vtcm_src1_size_per_thread = src1_sz_per_thread; + mmctx->vtcm_src2_size_per_thread = src2_sz_per_thread; + mmctx->vtcm_src3_size_per_thread = src3_sz_per_thread; - // group rows by src0 matrix - for (uint32_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { // token idx - for (uint32_t id = 0; id < n_ids; ++id) { // expert idx - const uint32_t i02 = *(const uint32_t *) ((const uint8_t *) ids->data + iid1 * ids->nb[1] + id * ids->nb[0]); + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) + return HTP_STATUS_OK; - assert(i02 >= 0 && i02 < n_as); + // Run quantization once + mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs; + worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs); - matrix_rows[i02 * n_ids * ids->ne[1] + matrix_row_counts[i02]] = (struct mmid_row_mapping) { id, iid1 }; - matrix_row_counts[i02] += 1; + // Run fused matmul + const uint32_t n_matmul_jobs = octx->n_threads; + worker_callback_t matmul_job_func; + if (is_repacked) { + if (kparams->kernel_type == HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT) { + switch (src0->type) { + case HTP_TYPE_Q4_0: matmul_job_func = hvx_mm_qkv_2d_repacked_q4_0_flat; break; + case HTP_TYPE_Q4_1: matmul_job_func = hvx_mm_qkv_2d_repacked_q4_1_flat; break; + case HTP_TYPE_Q8_0: matmul_job_func = hvx_mm_qkv_2d_repacked_q8_0_flat; break; + case HTP_TYPE_IQ4_NL: matmul_job_func = hvx_mm_qkv_2d_repacked_iq4nl_flat; break; + case HTP_TYPE_MXFP4: matmul_job_func = hvx_mm_qkv_2d_repacked_mxfp4_flat; break; + default: return HTP_STATUS_NO_SUPPORT; + } + } else { + switch (src0->type) { + case HTP_TYPE_Q4_0: matmul_job_func = hvx_mm_qkv_2d_repacked_q4_0; break; + case HTP_TYPE_Q4_1: matmul_job_func = hvx_mm_qkv_2d_repacked_q4_1; break; + case HTP_TYPE_Q8_0: matmul_job_func = hvx_mm_qkv_2d_repacked_q8_0; break; + case HTP_TYPE_IQ4_NL: matmul_job_func = hvx_mm_qkv_2d_repacked_iq4nl; break; + case HTP_TYPE_MXFP4: matmul_job_func = hvx_mm_qkv_2d_repacked_mxfp4; break; + default: return HTP_STATUS_NO_SUPPORT; } } + } else { + matmul_job_func = hvx_mm_qkv_2d; } + worker_pool_run_func(octx->ctx->worker_pool, matmul_job_func, mmctx, n_matmul_jobs); - if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) { - if (must_free_mapping) free(mapping_buf); - return HTP_STATUS_OK; + return HTP_STATUS_OK; +} + +int op_matmul_ffn(struct htp_ops_context * octx) { + const struct htp_tensor * restrict src0 = octx->src[0]; // Wgate + const struct htp_tensor * restrict src1 = octx->src[1]; // y + const struct htp_tensor * restrict src2 = octx->src[2]; // Wup + const struct htp_tensor * restrict dst_gate = octx->dsts[0]; + const struct htp_tensor * restrict dst_up = octx->dsts[1]; + + bool is_repacked = (src0->type == HTP_TYPE_Q4_0 || src0->type == HTP_TYPE_Q4_1 || + src0->type == HTP_TYPE_Q8_0 || src0->type == HTP_TYPE_IQ4_NL || + src0->type == HTP_TYPE_MXFP4); + + struct htp_mm_context mmctx_struct = {0}; + struct htp_mm_context * mmctx = &mmctx_struct; + mmctx->octx = octx; + + const struct htp_mm_kernel_params * kparams = (const struct htp_mm_kernel_params *) octx->kernel_params; + + const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; + const uint32_t src1_nrows = src1->ne[1] * src1->ne[2] * src1->ne[3]; + + // Compute src0_nrows_per_thread + mmctx->src0_nrows_per_thread = (src0_nrows + octx->n_threads - 1) / octx->n_threads; + if (is_repacked) { + mmctx->src0_nrows_per_thread = hex_round_up(mmctx->src0_nrows_per_thread, 32); + } else { + mmctx->src0_nrows_per_thread += (mmctx->src0_nrows_per_thread & 1); // round up to even } - bool hmx_eligible = false; -#ifdef HTP_HAS_HMX - if (octx->ctx->hmx_enabled && src1_nrows > 1) { - uint32_t wtype = src0->type; - if (ne01 % 32 == 0 && - (wtype == HTP_TYPE_F16 || wtype == HTP_TYPE_F32 || wtype == HTP_TYPE_Q4_0 || wtype == HTP_TYPE_Q4_1 || wtype == HTP_TYPE_Q8_0 || wtype == HTP_TYPE_IQ4_NL || wtype == HTP_TYPE_MXFP4)) { - if ((wtype == HTP_TYPE_F16 || wtype == HTP_TYPE_F32) && ne00 % 32 == 0) { - hmx_eligible = true; - } else if (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32 && ne00 % 256 == 0) { - hmx_eligible = true; - } - } + const size_t src0_row_size = src0->nb[1]; + const size_t src0_row_size_padded = hex_round_up(src0_row_size, 128); + + if (hvx_mm_init_vec_dot(mmctx, src0->type) != 0) { + return HTP_STATUS_NO_SUPPORT; } -#endif - mmctx->hmx_eligible = hmx_eligible; - - if (hmx_eligible) { - for (uint32_t cur_a = 0; cur_a < n_as; ++cur_a) { - const int32_t cne1 = matrix_row_counts[cur_a]; - if (cne1 == 0) continue; - - int ret = hmx_matmul_id_2d_f32(octx->ctx, (float*) dst->data, (float*) src1->data, - (const uint8_t *) src0->data + cur_a * nb02, - cne1, ne00, ne01, - ne11, - nb11, nb12, - nb1, nb2, - (int) src0->nb[1], (int) src0->type, - matrix_rows, cur_a, n_ids * ids->ne[1]); - if (ret != 0) { - FARF(ERROR, "HMX matmul failed for expert %u, error %d\n", cur_a, ret); - if (must_free_mapping) free(mapping_buf); - return HTP_STATUS_NO_SUPPORT; - } - } + const uint32_t qk = QK_Q8_0_TILED; + const uint32_t nb = (src1->ne[0] + qk - 1) / qk; + const uint32_t total_nb = src1_nrows * nb; - // HMX has overwritten VTCM, so force dynamic quantization cache to clear - octx->src1_spad.src = NULL; + worker_callback_t quant_job_func; + uint32_t n_quant_jobs = 1; + if (kparams->kernel_type == HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT) { + n_quant_jobs = MIN(src1_nrows, octx->n_threads); + quant_job_func = (src0->type == HTP_TYPE_Q4_1) ? quantize_f32_q8_1_flat : quantize_f32_q8_0_flat; + } else if (src1_nrows < octx->n_threads) { + n_quant_jobs = MIN(total_nb, octx->n_threads); + quant_job_func = (src0->type == HTP_TYPE_Q4_1) ? quantize_f32_q8_1_tiled_block : quantize_f32_q8_0_tiled_block; + for (uint32_t ith = 0; ith < n_quant_jobs; ++ith) { + uint32_t ib_first = (total_nb * (ith + 0)) / n_quant_jobs; + uint32_t ib_last = (total_nb * (ith + 1)) / n_quant_jobs; + mmctx->quant_ib_first[ith] = ib_first; + mmctx->quant_ib_last[ith] = ib_last; + mmctx->quant_r[ith] = ib_first / nb; + mmctx->quant_c[ith] = ib_first % nb; + } + } else { + n_quant_jobs = MIN(src1_nrows, octx->n_threads); + quant_job_func = (src0->type == HTP_TYPE_Q4_1) ? quantize_f32_q8_1_tiled : quantize_f32_q8_0_tiled; + } - if (must_free_mapping) free(mapping_buf); - return HTP_STATUS_OK; + size_t src1_row_size; + if (kparams->kernel_type == HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT) { + src1_row_size = (src0->type == HTP_TYPE_Q4_1) ? htp_mm_q8_1_flat_row_size(src1->ne[0]) : htp_mm_q8_0_flat_row_size(src1->ne[0]); + } else { + src1_row_size = (src0->type == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(src1->ne[0]) : htp_mm_q8_0_tiled_row_size(src1->ne[0]); } - if (octx->src1_spad.src != src1) { - const uint32_t n_quant_jobs = MIN(src1_nrows, octx->n_threads); - mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs; - worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs); - octx->src1_spad.src = src1; + // Set up scratchpads using precomputed sizes from the host + size_t src0_sz = kparams->vtcm_src0_size; + size_t src1_sz = kparams->vtcm_src1_size; + size_t src2_sz = kparams->vtcm_src2_size; + size_t vtcm_size = kparams->vtcm_size; + + size_t src0_sz_per_thread = src0_sz / octx->n_threads; + size_t src1_sz_per_thread = src1_sz; + size_t src2_sz_per_thread = src2_sz / octx->n_threads; + + if (octx->ctx->vtcm_size < vtcm_size) { + FARF(ERROR, "matmul-ffn: current VTCM reservation %zu is too small, needed %zu\n", octx->ctx->vtcm_size, vtcm_size); + return HTP_STATUS_VTCM_TOO_SMALL; } + uint8_t * vtcm_ptr = (uint8_t *) octx->ctx->vtcm_base; + mmctx->vtcm_src1 = vtcm_seq_alloc(&vtcm_ptr, src1_sz); + mmctx->vtcm_src0 = vtcm_seq_alloc(&vtcm_ptr, src0_sz); + mmctx->vtcm_src2 = vtcm_seq_alloc(&vtcm_ptr, src2_sz); + + octx->src1_spad.src = NULL; + octx->src0_spad.src = NULL; + octx->src2_spad.src = NULL; + + mmctx->vtcm_src0_stride = is_repacked ? 0 : src0_row_size_padded; + mmctx->vtcm_src2_stride = is_repacked ? 0 : src0_row_size_padded; + mmctx->vtcm_src1_stride = src1_row_size; + + mmctx->vtcm_src0_size_per_thread = src0_sz_per_thread; + mmctx->vtcm_src1_size_per_thread = src1_sz_per_thread; + mmctx->vtcm_src2_size_per_thread = src2_sz_per_thread; + + if (octx->flags & HTP_OPFLAGS_SKIP_COMPUTE) + return HTP_STATUS_OK; + + // Run quantization once + mmctx->src1_nrows_per_thread = (src1_nrows + n_quant_jobs - 1) / n_quant_jobs; + worker_pool_run_func(octx->ctx->worker_pool, quant_job_func, mmctx, n_quant_jobs); + + // Run fused matmul const uint32_t n_matmul_jobs = octx->n_threads; - worker_pool_run_func(octx->ctx->worker_pool, matmul_id_job_func, mmctx, n_matmul_jobs); + worker_callback_t matmul_job_func; + if (is_repacked) { + if (kparams->kernel_type == HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT) { + switch (src0->type) { + case HTP_TYPE_Q4_0: matmul_job_func = hvx_mm_ffn_2d_repacked_q4_0_flat; break; + case HTP_TYPE_Q4_1: matmul_job_func = hvx_mm_ffn_2d_repacked_q4_1_flat; break; + case HTP_TYPE_Q8_0: matmul_job_func = hvx_mm_ffn_2d_repacked_q8_0_flat; break; + case HTP_TYPE_IQ4_NL: matmul_job_func = hvx_mm_ffn_2d_repacked_iq4nl_flat; break; + case HTP_TYPE_MXFP4: matmul_job_func = hvx_mm_ffn_2d_repacked_mxfp4_flat; break; + default: return HTP_STATUS_NO_SUPPORT; + } + } else { + switch (src0->type) { + case HTP_TYPE_Q4_0: matmul_job_func = hvx_mm_ffn_2d_repacked_q4_0; break; + case HTP_TYPE_Q4_1: matmul_job_func = hvx_mm_ffn_2d_repacked_q4_1; break; + case HTP_TYPE_Q8_0: matmul_job_func = hvx_mm_ffn_2d_repacked_q8_0; break; + case HTP_TYPE_IQ4_NL: matmul_job_func = hvx_mm_ffn_2d_repacked_iq4nl; break; + case HTP_TYPE_MXFP4: matmul_job_func = hvx_mm_ffn_2d_repacked_mxfp4; break; + default: return HTP_STATUS_NO_SUPPORT; + } + } + } else { + matmul_job_func = hvx_mm_ffn_2d; + } + worker_pool_run_func(octx->ctx->worker_pool, matmul_job_func, mmctx, n_matmul_jobs); - if (must_free_mapping) free(mapping_buf); return HTP_STATUS_OK; } diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.h b/ggml/src/ggml-hexagon/htp/matmul-ops.h new file mode 100644 index 000000000000..a94d5430dabc --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.h @@ -0,0 +1,508 @@ +#ifndef HTP_MATMUL_OPS_H +#define HTP_MATMUL_OPS_H + +#include +#include +#include "htp-ops.h" +#include "hex-fastdiv.h" +#include "hex-common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// --- HMX Tile Constraints --- +#define HTP_MM_HMX_TILE_N_COLS 32 +#define HTP_MM_HMX_TILE_N_ROWS 32 +#define HTP_MM_HMX_TILE_SIZE (32 * 32 * sizeof(__fp16)) // 2048 bytes +#define HTP_MM_HMX_TILE_N_ELMS 1024 +#define HTP_MM_HMX_MIN_NROWS 4 + +// --- Weight Repacked Tile Sizes --- +#define HTP_MM_WEIGHT_TILE_SIZE_Q4_0 576 +#define HTP_MM_WEIGHT_TILE_SIZE_Q4_1 640 +#define HTP_MM_WEIGHT_TILE_SIZE_Q8_0 1088 +#define HTP_MM_WEIGHT_TILE_SIZE_IQ4_NL 576 +#define HTP_MM_WEIGHT_TILE_SIZE_MXFP4 544 + +// --- Weight Repacked Aligned Tile Sizes --- +#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_0 640 +#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_1 640 +#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q8_0 1152 +#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_IQ4_NL 640 +#define HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_MXFP4 640 + +// --- Activation Tiled Block Sizes (including padding) --- +#define HTP_MM_ACT_TILE_SIZE_Q8_0 1152 +#define HTP_MM_ACT_TILE_SIZE_Q8_1 1280 + +#define HTP_MM_MAX_PREFETCH 16 + +// --- Solver Cost Model Penalty Weights (HMX-specific) --- +#define HTP_MM_HMX_COST_W_DEQUANT 3 // cost penalty for quantized weight loading/dequantization +#define HTP_MM_HMX_COST_A_CONVERT 2 // cost penalty for activation loading/conversion + +// --- DMA Activation Transfer Configuration --- +#define HTP_MM_DMA_ACT_ROWS_PER_STEP 2 +#define HTP_MM_DMA_ACT_MULTIPLIER 4 + +enum htp_mm_kernel_type { + HTP_MM_KERNEL_UNSUPPORTED = 0, + + // HMX paths + HTP_MM_KERNEL_HMX_2D, + HTP_MM_KERNEL_HMX_F16_BATCHED, + + // HVX floating-point paths + HTP_MM_KERNEL_HVX_F16_F16_VTCM, + HTP_MM_KERNEL_HVX_F16_F16_DDR, + HTP_MM_KERNEL_HVX_F16_F32_DDR, + + HTP_MM_KERNEL_HVX_F32_F32_VTCM, + HTP_MM_KERNEL_HVX_F32_F32_DDR, + HTP_MM_KERNEL_HVX_F32_F16_DDR, + + // HVX quantized paths + HTP_MM_KERNEL_HVX_QUANT_ROW, // standard row-wise parallel quantization + HTP_MM_KERNEL_HVX_QUANT_BLOCK, // parallel block-wise quantization + HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT, // row-wise fallback flat quantization +}; + +// Op-specific struct for precomputed matmul params +struct htp_mm_kernel_params { + int32_t kernel_type; // enum htp_mm_kernel_type + int32_t pipeline; // 1 = pipelined execution, 0 = standard + int32_t m_chunk; // Row chunk size (M chunk) + int32_t n_chunk; // Col chunk size (N chunk) + int32_t n_threads; // Number of threads to spawn + int32_t n_act_threads; // Number of threads for activation preparation + int32_t n_hmx; // 1 = use HMX, 0 = use HVX + int32_t n_prefetch; // Prefetch lookahead buffers/rows in VTCM + int32_t tile_size; // Weight tile size + int32_t aligned_tile_size; // Aligned weight tile size (padded to 128) + int32_t src1_row_size; // Row size for quantized activation + int32_t vtcm_size; // Total required scratchpad size in VTCM + int32_t vtcm_src0_size; // src0 scratchpad size in VTCM + int32_t vtcm_src1_size; // src1 scratchpad size in VTCM + int32_t vtcm_src2_size; // src2 scratchpad size in VTCM (fused only) + int32_t vtcm_src3_size; // src3 scratchpad size in VTCM (fused only) + int32_t vtcm_dst_size; // dst scratchpad size in VTCM + + // Precomputed division values + struct fastdiv_values div_ne12_ne1; + struct fastdiv_values div_ne1; + struct fastdiv_values div_r2; + struct fastdiv_values div_r3; + struct fastdiv_values div_ne11; +}; + +#if defined(__cplusplus) +static_assert(sizeof(struct htp_mm_kernel_params) <= 128, "htp_matmul_kernel_params is too large for kernel_params blob"); +#else +_Static_assert(sizeof(struct htp_mm_kernel_params) <= 128, "htp_matmul_kernel_params is too large for kernel_params blob"); +#endif + +struct mmid_row_mapping { + uint32_t i1; + uint32_t i2; +}; + +// Search for optimal (mc, nc) chunk sizes within VTCM budget. +static inline int htp_mm_hmx_compute_chunks(size_t vtcm_total, + size_t overhead, + size_t per_n_cost, + size_t per_m_cost, + size_t per_mn_cost, + size_t m, + size_t n, + size_t m_block_cost, + size_t n_block_cost, + size_t * m_chunk_out, + size_t * n_chunk_out, + size_t * total_out) { + if (m == 0 || n == 0) return -1; + if (vtcm_total <= overhead) return -1; + if (per_n_cost == 0 || per_m_cost == 0 || per_mn_cost == 0) return -1; + + const size_t usable = vtcm_total - overhead; + + size_t best_cost = SIZE_MAX; + size_t best_mn = 0; + size_t best_m = 0, best_n = 0; + + const size_t n_max = hex_align_down((size_t)n, HTP_MM_HMX_TILE_N_COLS); + for (size_t nc = n_max; nc >= HTP_MM_HMX_TILE_N_COLS; nc -= HTP_MM_HMX_TILE_N_COLS) { + size_t n_fixed = 0, ncmn = 0, mc_denom = 0; + if (hex_mul_overflow(nc, per_n_cost, &n_fixed)) continue; + if (n_fixed >= usable) goto next_nc; + + if (hex_mul_overflow(nc, per_mn_cost, &ncmn)) goto next_nc; + if (hex_add_overflow(per_m_cost, ncmn, &mc_denom) || mc_denom == 0) goto next_nc; + + { + size_t remain = usable - n_fixed; + size_t mc = remain / mc_denom; + mc = hex_align_down(mc, HTP_MM_HMX_TILE_N_ROWS); + mc = hex_smin(mc, m); + + if (mc == 0) { + goto next_nc; + } + + size_t mblocks = ((size_t) m + mc - 1) / mc; + size_t nblocks = ((size_t) n + nc - 1) / nc; + size_t cost = mblocks * m_block_cost + nblocks * n_block_cost; + size_t mn = mc * nc; + if (cost < best_cost || (cost == best_cost && mn > best_mn)) { + best_cost = cost; + best_mn = mn; + best_m = mc; + best_n = nc; + } + } + +next_nc: + if (nc == HTP_MM_HMX_TILE_N_COLS) break; // avoid size_t underflow + } + + if (best_m == 0 || best_n == 0) return -1; + + // Compute exact total (with overflow checks) + size_t t0 = 0, t1 = 0, t2 = 0, mn = 0, total = 0; + if (hex_mul_overflow(best_n, per_n_cost, &t0)) return -1; + if (hex_mul_overflow(best_m, per_m_cost, &t1)) return -1; + if (hex_mul_overflow(best_m, best_n, &mn)) return -1; + if (hex_mul_overflow(mn, per_mn_cost, &t2)) return -1; + if (hex_add_overflow(t0, t1, &total)) return -1; + if (hex_add_overflow(total, t2, &total)) return -1; + if (hex_add_overflow(total, overhead, &total)) return -1; + + *m_chunk_out = best_m; + *n_chunk_out = best_n; + *total_out = total; + return 0; +} + +// --- Tile Size Helpers --- +static inline uint32_t htp_mm_get_weight_tile_size(int weight_type) { + switch (weight_type) { + case HTP_TYPE_Q4_0: + case HTP_TYPE_IQ4_NL: + return HTP_MM_WEIGHT_TILE_SIZE_Q4_0; + case HTP_TYPE_Q4_1: + return HTP_MM_WEIGHT_TILE_SIZE_Q4_1; + case HTP_TYPE_Q8_0: + return HTP_MM_WEIGHT_TILE_SIZE_Q8_0; + case HTP_TYPE_MXFP4: + return HTP_MM_WEIGHT_TILE_SIZE_MXFP4; + default: + return 0; + } +} + +static inline uint32_t htp_mm_get_weight_aligned_tile_size(int weight_type) { + switch (weight_type) { + case HTP_TYPE_Q4_0: + case HTP_TYPE_IQ4_NL: + return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_0; + case HTP_TYPE_Q4_1: + return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q4_1; + case HTP_TYPE_Q8_0: + return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_Q8_0; + case HTP_TYPE_MXFP4: + return HTP_MM_WEIGHT_ALIGNED_TILE_SIZE_MXFP4; + default: + return 0; + } +} + +// --- Activation/Row Size Helpers --- +static inline size_t htp_mm_q8_0_tiled_row_size(uint32_t ne) { + const uint32_t ne_padded = ((ne + 127) / 128) * 128; + const uint32_t nb_32 = ne_padded / 32; + return nb_32 * HTP_MM_ACT_TILE_SIZE_Q8_0; +} + +static inline size_t htp_mm_q8_1_tiled_row_size(uint32_t ne) { + const uint32_t ne_padded = ((ne + 127) / 128) * 128; + const uint32_t nb_32 = ne_padded / 32; + return nb_32 * HTP_MM_ACT_TILE_SIZE_Q8_1; +} + +static inline size_t htp_mm_q8_0_flat_row_size(uint32_t ne) { + const uint32_t quants_size = hex_align_up(ne, 128); + const uint32_t num_scales = (ne + 31) / 32; + const uint32_t scales_size = hex_align_up(num_scales * 2, 128); + return quants_size + scales_size; +} + +static inline size_t htp_mm_q8_1_flat_row_size(uint32_t ne) { + const uint32_t quants_size = hex_align_up(ne, 128); + const uint32_t num_scales = (ne + 31) / 32; + const uint32_t scales_size = hex_align_up(num_scales * 4, 128); + return quants_size + scales_size; +} + +static inline size_t htp_mm_get_tiled_row_stride(int weight_type, uint32_t k) { + uint32_t nb = (k + QK_Q4_0_TILED - 1) / QK_Q4_0_TILED; + switch (weight_type) { + case HTP_TYPE_Q4_0: + case HTP_TYPE_IQ4_NL: + case HTP_TYPE_Q4_1: + case HTP_TYPE_Q8_0: + case HTP_TYPE_MXFP4: + return (size_t) nb * htp_mm_get_weight_tile_size(weight_type); + case HTP_TYPE_F16: + return (size_t) k * sizeof(__fp16); + case HTP_TYPE_F32: + return (size_t) k * sizeof(float); + default: + return 0; + } +} + +static inline size_t htp_mm_round_up(size_t n, size_t m) { + return ((n + m - 1) / m) * m; +} + +static inline bool htp_mm_hmx_pipeline(uint32_t m) { + return m > 32; +} + +static inline void htp_mm_hmx_get_2d_chunk_costs( + int wtype, uint32_t k, bool pipeline, uint32_t aligned_tile_size, + size_t * size_per_n_out, size_t * size_per_m_out, size_t * size_per_mn_out +) { + const bool is_quant = (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32); + const size_t row_stride = htp_mm_get_tiled_row_stride(wtype, k); + const size_t vec_dot_size = k * sizeof(uint16_t); + const uint32_t n_k_tiles = k / HTP_MM_HMX_TILE_N_COLS; + const size_t qweight_row_stride = is_quant ? (size_t)(n_k_tiles * aligned_tile_size) / 32 : 0; + + *size_per_n_out = (pipeline ? 2 : 1) * (is_quant ? qweight_row_stride : row_stride) + + (pipeline ? 2 * vec_dot_size : vec_dot_size); + *size_per_m_out = vec_dot_size; + *size_per_mn_out = (pipeline ? 2 : 1) * sizeof(uint16_t); +} + +static inline void htp_mm_hmx_get_batched_chunk_costs( + uint32_t k, uint32_t group_size, + size_t * size_per_n_out, size_t * size_per_m_out, size_t * size_per_mn_out +) { + const size_t vec_dot_size = k * sizeof(uint16_t); + *size_per_n_out = 3 * vec_dot_size; + *size_per_m_out = group_size * vec_dot_size; + *size_per_mn_out = sizeof(uint16_t); +} + +static inline size_t htp_mm_hmx_get_2d_vtcm_size( + int wtype, uint32_t k, size_t mc, size_t nc, bool pipeline, uint32_t act_threads, uint32_t aligned_tile_size +) { + const uint32_t n_k_tiles = k / HTP_MM_HMX_TILE_N_COLS; + const bool is_quant = (wtype != HTP_TYPE_F16 && wtype != HTP_TYPE_F32); + const size_t row_stride = htp_mm_get_tiled_row_stride(wtype, k); + const size_t vec_dot_size = k * sizeof(uint16_t); + + const size_t act_f32_size = htp_mm_round_up(act_threads * 4 * k * sizeof(float), HTP_MM_HMX_TILE_SIZE); + size_t weight_area_size = is_quant + ? htp_mm_round_up((nc / 32) * n_k_tiles * aligned_tile_size, HTP_MM_HMX_TILE_SIZE) + : htp_mm_round_up(nc * row_stride, HTP_MM_HMX_TILE_SIZE); + if (pipeline) { + weight_area_size *= 2; + } + const size_t act_area_size = htp_mm_round_up(mc * vec_dot_size, HTP_MM_HMX_TILE_SIZE); + const size_t output_area_size = htp_mm_round_up(mc * nc * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE); + + size_t scratch0_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE); + size_t scratch1_size = pipeline ? scratch0_size : 0; + size_t scratch2_size = pipeline ? output_area_size : 0; + + return weight_area_size + act_area_size + act_f32_size + output_area_size + + scratch0_size + scratch1_size + scratch2_size + 256; +} + +static inline size_t htp_mm_hmx_get_batched_vtcm_size( + int wtype, uint32_t k, size_t mc, size_t nc, uint32_t group_size, bool use_dma_activation, bool pipeline, uint32_t act_threads) { + (void)wtype; + (void)pipeline; + const size_t vec_dot_size = k * sizeof(uint16_t); + const size_t f32_scratch_size = use_dma_activation + ? htp_mm_round_up(act_threads * 4 * k * sizeof(float), HTP_MM_HMX_TILE_SIZE) : 0; + + const size_t act_head_stride = mc * k; + const size_t weight_area_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE); + const size_t act_area_size = htp_mm_round_up(group_size * act_head_stride * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE); + const size_t output_area_size = htp_mm_round_up(group_size * mc * nc * sizeof(uint16_t), HTP_MM_HMX_TILE_SIZE); + const size_t scratch_area_size = htp_mm_round_up(nc * vec_dot_size, HTP_MM_HMX_TILE_SIZE); + + return weight_area_size + act_area_size + output_area_size + + 2 * scratch_area_size + 256 + f32_scratch_size; +} + +static inline size_t htp_mm_hvx_get_vtcm_sizes( + int kernel_type, + int wtype, + uint32_t ne10, // k + uint32_t src1_nrows, // m_total (or act_nrows) + uint32_t n_threads, + size_t dst_row_size, + size_t src0_row_size, + size_t src1_row_size, + uint32_t n_prefetch, + size_t * vtcm_src0_size_out, + size_t * vtcm_src1_size_out, + size_t * vtcm_dst_size_out +) { + size_t vtcm_src0_size = 0; + size_t vtcm_src1_size = 0; + size_t vtcm_dst_size = 0; + + const bool is_repack = (wtype == HTP_TYPE_Q4_0 || wtype == HTP_TYPE_Q4_1 || + wtype == HTP_TYPE_Q8_0 || wtype == HTP_TYPE_IQ4_NL || + wtype == HTP_TYPE_MXFP4); + + const size_t src0_row_size_padded = htp_mm_round_up(src0_row_size, 128); + const size_t dst_nrows = (src1_nrows > 1) ? 0 : 1; + + switch (kernel_type) { + case HTP_MM_KERNEL_HVX_F16_F16_VTCM: { + size_t f16_src1_row_size = htp_mm_round_up(ne10 * 2, 128); + vtcm_src1_size = htp_mm_round_up(f16_src1_row_size * src1_nrows, 256); + vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256) * n_threads; + vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0; + break; + } + case HTP_MM_KERNEL_HVX_F16_F32_DDR: + case HTP_MM_KERNEL_HVX_F16_F16_DDR: + case HTP_MM_KERNEL_HVX_F32_F32_DDR: + case HTP_MM_KERNEL_HVX_F32_F16_DDR: { + vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size, 256) * n_threads; + vtcm_src1_size = htp_mm_round_up(n_prefetch * src1_row_size, 256) * n_threads; + vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0; + break; + } + case HTP_MM_KERNEL_HVX_F32_F32_VTCM: { + size_t f32_src1_row_size = htp_mm_round_up(ne10 * 4, 128); + vtcm_src1_size = htp_mm_round_up(f32_src1_row_size * src1_nrows, 256); + vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256) * n_threads; + vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) * n_threads : 0; + break; + } + case HTP_MM_KERNEL_HVX_QUANT_BLOCK: + case HTP_MM_KERNEL_HVX_QUANT_ROW: { + size_t q_src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10) : htp_mm_q8_0_tiled_row_size(ne10); + + vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) : 0; + vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256); + vtcm_src1_size = htp_mm_round_up(q_src1_row_size * src1_nrows, 256); + + // src0 spad is also used in dynamic quantizer to store padded src1 rows + size_t src1_row_size_padded = htp_mm_round_up(q_src1_row_size, QK_Q8_0_TILED * sizeof(float)); + if (vtcm_src0_size < src1_row_size_padded) { + vtcm_src0_size = src1_row_size_padded; + } + + vtcm_src0_size = vtcm_src0_size * n_threads; + vtcm_dst_size = vtcm_dst_size * n_threads; + + if (is_repack) { + uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype); + uint32_t n_k_tiles = ne10 / 32; + uint32_t tile_row_size = n_k_tiles * aligned_tile_size; + size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256); + if (repacked_vtcm_size < src1_row_size_padded) { + repacked_vtcm_size = src1_row_size_padded; + } + vtcm_src0_size = repacked_vtcm_size * n_threads; + } + break; + } + case HTP_MM_KERNEL_HVX_QUANT_ROW_FLAT: { + size_t q_src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_flat_row_size(ne10) : htp_mm_q8_0_flat_row_size(ne10); + + vtcm_dst_size = dst_nrows > 0 ? htp_mm_round_up(dst_row_size, 128) : 0; + vtcm_src0_size = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256); + vtcm_src1_size = htp_mm_round_up(q_src1_row_size * src1_nrows, 256); + + size_t src1_row_size_padded = htp_mm_round_up(q_src1_row_size, 256); + if (vtcm_src0_size < src1_row_size_padded) { + vtcm_src0_size = src1_row_size_padded; + } + + vtcm_src0_size = vtcm_src0_size * n_threads; + vtcm_dst_size = vtcm_dst_size * n_threads; + + if (is_repack) { + uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype); + uint32_t n_k_tiles = ne10 / 32; + uint32_t tile_row_size = n_k_tiles * aligned_tile_size; + size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256); + if (repacked_vtcm_size < src1_row_size_padded) { + repacked_vtcm_size = src1_row_size_padded; + } + vtcm_src0_size = repacked_vtcm_size * n_threads; + } + break; + } + default: + break; + } + + *vtcm_src0_size_out = vtcm_src0_size; + *vtcm_src1_size_out = vtcm_src1_size; + *vtcm_dst_size_out = vtcm_dst_size; + + return vtcm_src0_size + vtcm_src1_size + vtcm_dst_size; +} + +static inline size_t htp_mm_hvx_id_get_vtcm_sizes( + int wtype, + uint32_t ne10, // k + uint32_t src1_nrows, + uint32_t n_threads, + size_t src0_row_size, // nb01 + uint32_t n_prefetch, + size_t * vtcm_src0_size_out, + size_t * vtcm_src1_size_out +) { + const bool is_repack = (wtype == HTP_TYPE_Q4_0 || wtype == HTP_TYPE_Q4_1 || + wtype == HTP_TYPE_Q8_0 || wtype == HTP_TYPE_IQ4_NL || + wtype == HTP_TYPE_MXFP4); + + const size_t src0_row_size_padded = htp_mm_round_up(src0_row_size, 128); + const size_t src1_row_size = (wtype == HTP_TYPE_Q4_1) ? htp_mm_q8_1_tiled_row_size(ne10) + : htp_mm_q8_0_tiled_row_size(ne10); + + size_t src0_sz_per_thread = htp_mm_round_up(n_prefetch * src0_row_size_padded, 256); + size_t src1_sz = htp_mm_round_up(src1_row_size * src1_nrows, 256); + + // src0 spad also holds temporary transposed src1 columns during dynamic quantization. + const size_t src1_row_size_padded = htp_mm_round_up(src1_row_size, QK_Q8_0_TILED * sizeof(float)); + if (src0_sz_per_thread < src1_row_size_padded) { + src0_sz_per_thread = src1_row_size_padded; + } + + if (is_repack) { + const uint32_t aligned_tile_size = htp_mm_get_weight_aligned_tile_size(wtype); + const uint32_t n_k_tiles = ne10 / 32; + const uint32_t tile_row_size = n_k_tiles * aligned_tile_size; + size_t repacked_vtcm_size = htp_mm_round_up(n_prefetch * tile_row_size, 256); + if (repacked_vtcm_size < src1_row_size_padded) { + repacked_vtcm_size = src1_row_size_padded; + } + src0_sz_per_thread = repacked_vtcm_size; + } + + const size_t vtcm_src0_size = src0_sz_per_thread * n_threads; + + *vtcm_src0_size_out = vtcm_src0_size; + *vtcm_src1_size_out = src1_sz; + + return vtcm_src0_size + src1_sz; +} + +#ifdef __cplusplus +} +#endif + +#endif // HTP_MATMUL_OPS_H diff --git a/ggml/src/ggml-hexagon/htp/ssm-conv.c b/ggml/src/ggml-hexagon/htp/ssm-conv.c index d574da2e2bc9..a48bc9ed86b2 100644 --- a/ggml/src/ggml-hexagon/htp/ssm-conv.c +++ b/ggml/src/ggml-hexagon/htp/ssm-conv.c @@ -183,24 +183,25 @@ static inline void hvx_transpose_32x32_f32(HVX_Vector m[32]) { // transposed into VTCM. // // VTCM layouts (per thread): -// src1_T : {d_inner_per_thread, d_conv} — staged once per launch (small). -// src0_T : {d_inner_tile, ncs} — staged per d_inner-tile. +// src1_T : {d_inner_stride, d_conv} - staged once per launch (small). +// src0_T : {d_inner_tile, ncs} - staged per d_inner-tile. // // d_inner_tile is chosen so that per-thread VTCM stays under the budget. // Each thread iterates ceil(d_inner_per_thread d_inner_tile) tiles serially. #define HTP_SSM_CONV_VTCM_BUDGET (1u << 20) // 1 MiB per thread -// Scalar transpose: src1 {d_conv, d_inner} (DDR) -> {d_inner_per_thread, d_conv} (VTCM) +// Scalar transpose: src1 {d_conv, d_inner} (DDR) -> {d_inner_stride, d_conv} (VTCM) static inline void transpose_src1(const float * src1_data, uint32_t src1_stride_inner, uint32_t i1_off, uint32_t d_inner_per_thread, + uint32_t d_inner_stride, uint32_t d_conv, float * src1_T) { for (uint32_t i = 0; i < d_inner_per_thread; ++i) { const float * src_row = src1_data + (i1_off + i) * src1_stride_inner; for (uint32_t j = 0; j < d_conv; ++j) { - src1_T[j * d_inner_per_thread + i] = src_row[j]; + src1_T[j * d_inner_stride + i] = src_row[j]; } } } @@ -280,6 +281,7 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void } const uint32_t d_inner_per_thread = ir1 - ir0; + const uint32_t d_inner_stride = scctx->nrows_per_thread; const uint32_t d_inner_tile = scctx->d_inner_tile; const float * src0_data = (const float *) src0->data; @@ -290,8 +292,8 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void float * src0_T = (float *)(octx->src0_spad.data + ith * octx->src0_spad.size_per_thread); float * src1_T = (float *)(octx->src1_spad.data + ith * octx->src1_spad.size_per_thread); - // Stage src1 weights once into VTCM in {d_inner_per_thread, d_conv} layout. - transpose_src1(src1_data, src1_stride_inner, ir0, d_inner_per_thread, d_conv, src1_T); + // Stage src1 weights once into VTCM in {d_inner_stride, d_conv} layout. + transpose_src1(src1_data, src1_stride_inner, ir0, d_inner_per_thread, d_inner_stride, d_conv, src1_T); const uint32_t C_TILE = VLEN_FP32; @@ -314,7 +316,7 @@ static void ssm_conv_thread_f32_f32_hvx(unsigned int nth, unsigned int ith, void HVX_Vector acc = hvx_vec_splat_f32(0.0f); for (uint32_t j = 0; j < d_conv; ++j) { HVX_Vector x = *(const HVX_Vector *) (src0_T + (t + j) * d_inner_tile + cb); - HVX_Vector w = *(const HVX_Vector *) (src1_T + j * d_inner_per_thread + tile_off + cb); + HVX_Vector w = *(const HVX_Vector *) (src1_T + j * d_inner_stride + tile_off + cb); acc = Q6_Vqf32_vadd_Vqf32Vqf32(acc, Q6_Vqf32_vmpy_VsfVsf(x, w)); } HVX_Vector res = Q6_Vsf_equals_Vqf32(acc); @@ -362,8 +364,7 @@ int op_ssm_conv_f32(struct htp_ops_context * octx) { use_hvx = 1; } - scctx.nrows_per_thread = (d_inner + n_threads - 1) / n_threads; - scctx.nrows_per_thread += (scctx.nrows_per_thread & 1); + scctx.nrows_per_thread = hex_round_up((d_inner + n_threads - 1) / n_threads, VLEN_FP32); const uint32_t d_inner_per_thread = scctx.nrows_per_thread; const uint32_t ncs = src0->ne[0]; diff --git a/ggml/src/ggml-hexagon/libggml-htp.inf b/ggml/src/ggml-hexagon/libggml-htp.inf index 39cefcdda381..874dde1b8878 100644 --- a/ggml/src/ggml-hexagon/libggml-htp.inf +++ b/ggml/src/ggml-hexagon/libggml-htp.inf @@ -14,8 +14,6 @@ Drivers_Dir = 13 1 = %DiskId% [SourceDisksFiles] -libggml-htp-v68.so = 1 -libggml-htp-v69.so = 1 libggml-htp-v73.so = 1 libggml-htp-v75.so = 1 libggml-htp-v79.so = 1 @@ -28,8 +26,6 @@ ExcludeFromSelect = * CopyFiles=Drivers_Dir [Drivers_Dir] -libggml-htp-v68.so,,,0x10 ;COPYFLG_NO_OVERWRITE -libggml-htp-v69.so,,,0x10 ;COPYFLG_NO_OVERWRITE libggml-htp-v73.so,,,0x10 ;COPYFLG_NO_OVERWRITE libggml-htp-v75.so,,,0x10 ;COPYFLG_NO_OVERWRITE libggml-htp-v79.so,,,0x10 ;COPYFLG_NO_OVERWRITE diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index 4f4f073cb614..0e1f1de4577d 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -66,7 +66,6 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_base(ggml const char * op_str = "undefined"; switch (op) { case GGML_OP_ADD_ID: op_str = "add_id"; break; - case GGML_OP_CONCAT: op_str = "concat"; break; default: GGML_ABORT("fatal error"); }; @@ -211,6 +210,21 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat(ggml_meta return res; } +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_concat(ggml_metal_library_t lib, ggml_type tsrc) { + char base[256]; + char name[256]; + + snprintf(base, 256, "kernel_concat_%s", ggml_type_name(tsrc)); + snprintf(name, 256, "%s", base); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + } + + return res; +} + ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary(ggml_metal_library_t lib, const ggml_tensor * op) { char base[256]; char name[256]; @@ -1689,7 +1703,9 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_norm(ggml_metal_ } ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_library_t lib, const ggml_tensor * op) { - assert(op->op == GGML_OP_ROPE); + assert(op->op == GGML_OP_ROPE || op->op == GGML_OP_ROPE_BACK); + + const bool is_back = op->op == GGML_OP_ROPE_BACK; char base[256]; char name[256]; @@ -1713,13 +1729,14 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_ snprintf(base, 256, "kernel_rope_norm_%s", ggml_type_name(op->src[0]->type)); } - snprintf(name, 256, "%s_imrope=%d", base, is_imrope ? 1 : 0); + snprintf(name, 256, "%s_imrope=%d_is_back=%d", base, is_imrope ? 1 : 0, is_back ? 1 : 0); ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); if (!res.pipeline) { ggml_metal_cv_t cv = ggml_metal_cv_init(); ggml_metal_cv_set_bool(cv, is_imrope, FC_ROPE + 0); + ggml_metal_cv_set_bool(cv, is_back, FC_ROPE + 1); res = ggml_metal_library_compile_pipeline(lib, base, name, cv); diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index 4a3ebb5569d9..d465f31c083b 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -115,6 +115,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows (ggml_metal_library_t lib, enum ggml_type tidx, enum ggml_type tdst); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat (ggml_metal_library_t lib, enum ggml_type tsrc); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_concat (ggml_metal_library_t lib, enum ggml_type tsrc); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_glu (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_sum (ggml_metal_library_t lib, const struct ggml_tensor * op); diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index 05d7f43051ba..a7cbc60ebe41 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1120,8 +1120,28 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_OP_VIEW: case GGML_OP_TRANSPOSE: case GGML_OP_PERMUTE: - case GGML_OP_CONCAT: return true; + case GGML_OP_CONCAT: + { + const enum ggml_type src0_type = op->src[0]->type; + const enum ggml_type src1_type = op->src[1]->type; + if (src0_type != src1_type || src0_type != op->type) { + return false; + } + switch (src0_type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_I64: + return true; + case GGML_TYPE_BF16: + return has_bfloat; + default: + return false; + } + } case GGML_OP_ADD: case GGML_OP_SUB: case GGML_OP_MUL: @@ -1164,6 +1184,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te case GGML_OP_RMS_NORM: return has_simdgroup_reduction && (ggml_is_contiguous_rows(op->src[0])); case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: return true; case GGML_OP_IM2COL: return ggml_is_contiguous(op->src[1]) && op->src[1]->type == GGML_TYPE_F32 && (op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_F32); diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index e2ce56e9e28b..18656b346f21 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -375,6 +375,7 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) { n_fuse = ggml_metal_op_norm(ctx, idx); } break; case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: { n_fuse = ggml_metal_op_rope(ctx, idx); } break; @@ -556,7 +557,7 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) { /*.dim =*/ dim, }; - auto pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_CONCAT); + auto pipeline = ggml_metal_library_get_pipeline_concat(lib, op->type); ggml_metal_encoder_set_pipeline(enc, pipeline); ggml_metal_encoder_set_bytes (enc, &args, sizeof(args), 0); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 0aea68455fba..25e78e100898 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1418,6 +1418,9 @@ typedef decltype(kernel_repeat) kernel_repeat_t; template [[host_name("kernel_repeat_f32")]] kernel kernel_repeat_t kernel_repeat; template [[host_name("kernel_repeat_f16")]] kernel kernel_repeat_t kernel_repeat; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_repeat_bf16")]] kernel kernel_repeat_t kernel_repeat; +#endif template [[host_name("kernel_repeat_i32")]] kernel kernel_repeat_t kernel_repeat; template [[host_name("kernel_repeat_i16")]] kernel kernel_repeat_t kernel_repeat; @@ -4355,6 +4358,7 @@ template [[host_name("kernel_mul_mv_bf16_bf16_short")]] kernel mul_mv_t_t_short_ #endif constant bool FC_rope_is_imrope [[function_constant(FC_ROPE + 0)]]; +constant bool FC_rope_is_back [[function_constant(FC_ROPE + 1)]]; static float rope_yarn_ramp(const float low, const float high, const int i0) { const float y = (i0 / 2 - low) / max(0.001f, high - low); @@ -4378,6 +4382,9 @@ static void rope_yarn( } *cos_theta = cos(theta) * mscale; *sin_theta = sin(theta) * mscale; + if (FC_rope_is_back) { + *sin_theta *= -1.0f; + } } // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get @@ -7510,14 +7517,15 @@ template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32< template [[host_name("kernel_cpy_q5_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32; template [[host_name("kernel_cpy_q8_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32; +template kernel void kernel_concat( - constant ggml_metal_kargs_concat & args, - device const char * src0, - device const char * src1, - device char * dst, - uint3 tgpig[[threadgroup_position_in_grid]], - ushort3 tpitg[[thread_position_in_threadgroup]], - ushort3 ntg[[threads_per_threadgroup]]) { + constant ggml_metal_kargs_concat & args, + device const char * src0, + device const char * src1, + device char * dst, + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { const int i3 = tgpig.z; const int i2 = tgpig.y; @@ -7530,21 +7538,33 @@ kernel void kernel_concat( int o[4] = {0, 0, 0, 0}; o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03)); - device const float * x; - for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) { + device const T * x; + if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) { - x = (device const float *)(src0 + (i3 )*args.nb03 + (i2 )*args.nb02 + (i1 )*args.nb01 + (i0 )*args.nb00); + x = (device const T *)(src0 + (i3 )*args.nb03 + (i2 )*args.nb02 + (i1 )*args.nb01 + (i0 )*args.nb00); } else { - x = (device const float *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10); + x = (device const T *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10); } - device float * y = (device float *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); + device T * y = (device T *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0); *y = *x; } } +typedef decltype(kernel_concat) kernel_concat_t; + +template [[host_name("kernel_concat_f32")]] kernel kernel_concat_t kernel_concat; +template [[host_name("kernel_concat_f16")]] kernel kernel_concat_t kernel_concat; +#if defined(GGML_METAL_HAS_BF16) +template [[host_name("kernel_concat_bf16")]] kernel kernel_concat_t kernel_concat; +#endif +template [[host_name("kernel_concat_i8")]] kernel kernel_concat_t kernel_concat; +template [[host_name("kernel_concat_i16")]] kernel kernel_concat_t kernel_concat; +template [[host_name("kernel_concat_i32")]] kernel kernel_concat_t kernel_concat; +template [[host_name("kernel_concat_i64")]] kernel kernel_concat_t kernel_concat; + template void kernel_mul_mv_q2_K_f32_impl( args_t args, diff --git a/ggml/src/ggml-opencl/CMakeLists.txt b/ggml/src/ggml-opencl/CMakeLists.txt index cd15d5732389..82ce61d72c6d 100644 --- a/ggml/src/ggml-opencl/CMakeLists.txt +++ b/ggml/src/ggml-opencl/CMakeLists.txt @@ -142,6 +142,10 @@ set(GGML_OPENCL_KERNELS gemm_noshuffle_q4_0_f32 gemv_noshuffle_q4_1_f32 gemm_noshuffle_q4_1_f32 + gemv_noshuffle_q5_0_f32 + gemm_noshuffle_q5_0_f32 + gemv_noshuffle_q5_1_f32 + gemm_noshuffle_q5_1_f32 gemv_noshuffle_iq4_nl_f32 gemm_noshuffle_iq4_nl_f32 gemv_noshuffle_q8_0_f32 diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp index d30579b94523..00f20b09b8fd 100644 --- a/ggml/src/ggml-opencl/ggml-opencl.cpp +++ b/ggml/src/ggml-opencl/ggml-opencl.cpp @@ -564,6 +564,9 @@ struct ggml_backend_opencl_context { cl_kernel kernel_mul_mat_f16_f32_1row; cl_kernel kernel_mul_mat_f16_f32; cl_kernel kernel_mul_mat_f16_f32_l4; + cl_kernel kernel_mul_mat_f16_f32_l4_dr; + cl_kernel kernel_mul_mat_f16_f32_l4_dr_ls; + cl_kernel kernel_mul_mat_f16_f32_l4_dr_lq; cl_kernel kernel_mul_mat_f16_f32_tiled; cl_kernel kernel_adreno_xmem_pack_src_f32; cl_kernel kernel_adreno_xmem_prepack_weight_f16; @@ -593,6 +596,10 @@ struct ggml_backend_opencl_context { cl_kernel kernel_restore_block_q4_0_noshuffle; cl_kernel kernel_convert_block_q4_1_noshuffle; cl_kernel kernel_restore_block_q4_1_noshuffle; + cl_kernel kernel_convert_block_q5_0_noshuffle; + cl_kernel kernel_restore_block_q5_0_noshuffle; + cl_kernel kernel_convert_block_q5_1_noshuffle; + cl_kernel kernel_restore_block_q5_1_noshuffle; cl_kernel kernel_convert_block_q4_K_noshuffle; cl_kernel kernel_restore_block_q4_K_noshuffle; cl_kernel kernel_convert_block_q4_K, kernel_restore_block_q4_K; @@ -829,6 +836,10 @@ struct ggml_backend_opencl_context { cl_kernel kernel_gemm_noshuffle_q6_K_f32; cl_kernel kernel_gemv_noshuffle_q5_k_f32; cl_kernel kernel_gemm_noshuffle_q5_k_f32; + cl_kernel kernel_gemv_noshuffle_q5_0_f32; + cl_kernel kernel_gemm_noshuffle_q5_0_f32; + cl_kernel kernel_gemv_noshuffle_q5_1_f32; + cl_kernel kernel_gemm_noshuffle_q5_1_f32; cl_kernel kernel_gemv_noshuffle_iq4_nl_f32; cl_kernel kernel_gemm_noshuffle_iq4_nl_f32; #endif // GGML_OPENCL_USE_ADRENO_KERNELS @@ -839,6 +850,7 @@ struct ggml_backend_opencl_context { ref_count--; if (ref_count == 0) { #ifdef GGML_OPENCL_PROFILING + flush_profiling_batch(); write_profiling_info(); profiling_results.clear(); #endif @@ -1152,6 +1164,10 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { CL_CHECK((backend_ctx->kernel_restore_block_q4_1_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q4_1_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q5_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q5_0 = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q5_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0_noshuffle", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q5_0_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_noshuffle", &err), err)); + CL_CHECK((backend_ctx->kernel_convert_block_q5_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1_noshuffle", &err), err)); + CL_CHECK((backend_ctx->kernel_restore_block_q5_1_noshuffle = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_1_noshuffle", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_0_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_restore_block_q5_0_trans4_ns = clCreateKernel(backend_ctx->program_cvt, "kernel_restore_block_q5_0_trans4_ns", &err), err)); CL_CHECK((backend_ctx->kernel_convert_block_q5_1 = clCreateKernel(backend_ctx->program_cvt, "kernel_convert_block_q5_1", &err), err)); @@ -1775,6 +1791,11 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4 = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4", &err), err)); + CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr", &err), err)); + if (backend_ctx->gpu_family == ADRENO) { + CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr_ls", &err), err)); + CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr_lq", &err), err)); + } GGML_LOG_CONT("."); } @@ -3065,6 +3086,80 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) { GGML_LOG_CONT("."); } + // gemm_noshuffle_q5_0_f32 + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_noshuffle_q5_0_f32.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_noshuffle_q5_0_f32.cl"); +#endif + cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q5_0_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q5_0_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemv_noshuffle_q5_0_f32 + { + std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + + " -cl-mad-enable "; + if (backend_ctx->has_vector_subgroup_broadcast) { + CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST "; + } + +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemv_noshuffle_q5_0_f32.cl.h" + }; +#else + const std::string kernel_src = read_file("gemv_noshuffle_q5_0_f32.cl"); +#endif + cl_program prog = build_program_from_source( + backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts); + CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q5_0_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q5_0_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemm_noshuffle_q5_1_f32 + { +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemm_noshuffle_q5_1_f32.cl.h" + }; +#else + const std::string kernel_src = read_file("gemm_noshuffle_q5_1_f32.cl"); +#endif + cl_program prog = build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts); + CL_CHECK((backend_ctx->kernel_gemm_noshuffle_q5_1_f32 = clCreateKernel(prog, "kernel_gemm_noshuffle_q5_1_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + + // gemv_noshuffle_q5_1_f32 + { + std::string CL_gemv_compile_opts = std::string("-cl-std=") + opencl_c_std + + " -cl-mad-enable "; + if (backend_ctx->has_vector_subgroup_broadcast) { + CL_gemv_compile_opts += " -DVECTOR_SUB_GROUP_BROADCAST "; + } + +#ifdef GGML_OPENCL_EMBED_KERNELS + const std::string kernel_src { + #include "gemv_noshuffle_q5_1_f32.cl.h" + }; +#else + const std::string kernel_src = read_file("gemv_noshuffle_q5_1_f32.cl"); +#endif + cl_program prog = build_program_from_source( + backend_ctx->context, backend_ctx->device, kernel_src.c_str(), CL_gemv_compile_opts); + CL_CHECK((backend_ctx->kernel_gemv_noshuffle_q5_1_f32 = clCreateKernel(prog, "kernel_gemv_noshuffle_q5_1_f32", &err), err)); + CL_CHECK(clReleaseProgram(prog)); + GGML_LOG_CONT("."); + } + // gemm_noshuffle_iq4_nl_f32 { #ifdef GGML_OPENCL_EMBED_KERNELS @@ -6107,15 +6202,16 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, return; } #endif // GGML_OPENCL_USE_ADRENO_KERNELS - cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0; - cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type); + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_kernels(backend_ctx, tensor)) { + cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0_noshuffle; CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh)); CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &n_blk)); - size_t global_work_size[] = {(size_t)CEIL_DIV(n_blk, 64) * 64, 1, 1}; + size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; size_t local_work_size[] = {64, 1, 1}; cl_event evt; @@ -6124,7 +6220,39 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, CL_CHECK(clReleaseMemObject(data_device)); tensor->extra = extra; + + int M = tensor->ne[1]; + int K = tensor->ne[0]; + GGML_ASSERT(K % 32 == 0); + + // Transpose qs as ushort + transpose_2d_as_16b(backend_ctx, extra->qs, extra->qs, size_qs, K/4, M); + // Transpose qh as uchar + transpose_2d_as_8b(backend_ctx, extra->qh, extra->qh, size_qh, K/8, M); + // Transpose d as ushort + transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M); + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS + cl_kernel kernel = backend_ctx->kernel_convert_block_q5_0; + cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type); + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &n_blk)); + + size_t global_work_size[] = {(size_t)CEIL_DIV(n_blk, 64) * 64, 1, 1}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + tensor->extra = extra; + return; } if (tensor->type == GGML_TYPE_Q5_1) { ggml_tensor_extra_cl * extra_orig = (ggml_tensor_extra_cl *)tensor->extra; @@ -6225,6 +6353,42 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, return; } #endif // GGML_OPENCL_USE_ADRENO_KERNELS + +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + if (use_adreno_kernels(backend_ctx, tensor)) { + cl_kernel kernel = backend_ctx->kernel_convert_block_q5_1_noshuffle; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra->qs)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra->qh)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra->d)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extra->m)); + + size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; + size_t local_work_size[] = {64, 1, 1}; + + cl_event evt; + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt)); + CL_CHECK(clWaitForEvents(1, &evt)); + CL_CHECK(clReleaseMemObject(data_device)); + + tensor->extra = extra; + + int M = tensor->ne[1]; + int K = tensor->ne[0]; + GGML_ASSERT(K % 32 == 0); + + // Transpose qs as ushort + transpose_2d_as_16b(backend_ctx, extra->qs, extra->qs, size_qs, K/4, M); + // Transpose qh as uchar + transpose_2d_as_8b(backend_ctx, extra->qh, extra->qh, size_qh, K/8, M); + // Transpose d as ushort + transpose_2d_as_16b(backend_ctx, extra->d, extra->d, size_d, K/32, M); + // Transpose m as ushort + transpose_2d_as_16b(backend_ctx, extra->m, extra->m, size_m, K/32, M); + + return; + } +#endif // GGML_OPENCL_USE_ADRENO_KERNELS cl_kernel kernel = backend_ctx->kernel_convert_block_q5_1; cl_ulong n_blk = ggml_nelements(tensor)/ggml_blck_size(tensor->type); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_device)); @@ -7299,6 +7463,48 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, CL_CHECK(clReleaseMemObject(data_device)); return; } + if (use_adreno_kernels(backend_ctx, tensor)) { + ggml_cl_buffer buf_trans_qs; + ggml_cl_buffer buf_trans_qh; + ggml_cl_buffer buf_trans_d; + ggml_cl_buffer buf_unpacked; + + cl_int M = tensor->ne[1]; + cl_int K = tensor->ne[0]; + + GGML_ASSERT(K % 32 == 0); + + size_t size_qs = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2; + size_t size_qh = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(int32_t); + size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t); + + buf_trans_qs.allocate(backend_ctx->context, size_qs); + buf_trans_qh.allocate(backend_ctx->context, size_qh); + buf_trans_d.allocate(backend_ctx->context, size_d); + buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor)); + + transpose_2d_as_16b(backend_ctx, extra->qs, buf_trans_qs.buffer, size_qs, M, K/4); + transpose_2d_as_8b(backend_ctx, extra->qh, buf_trans_qh.buffer, size_qh, M, K/8); + transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/32); + + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + + size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; + size_t local_work_size[] = {1, 1, 1}; + + cl_kernel kernel = backend_ctx->kernel_restore_block_q5_0_noshuffle; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_qs.buffer)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_qh.buffer)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_trans_d.buffer)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_unpacked.buffer)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_F0)); + + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL)); + return; + } #endif // GGML_OPENCL_USE_ADRENO_KERNELS cl_int err; @@ -7362,6 +7568,54 @@ static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, CL_CHECK(clReleaseMemObject(data_device)); return; } + + if (use_adreno_kernels(backend_ctx, tensor)) { + ggml_cl_buffer buf_trans_qs; + ggml_cl_buffer buf_trans_qh; + ggml_cl_buffer buf_trans_d; + ggml_cl_buffer buf_trans_m; + ggml_cl_buffer buf_unpacked; + + cl_int M = tensor->ne[1]; + cl_int K = tensor->ne[0]; + GGML_ASSERT(K % 32 == 0); + + size_t size_qs = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*ggml_blck_size(tensor->type)/2; + size_t size_qh = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(int32_t); + size_t size_d = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t); + size_t size_m = (ggml_nelements(tensor)/ggml_blck_size(tensor->type))*sizeof(ggml_fp16_t); + + buf_trans_qs.allocate(backend_ctx->context, size_qs); + buf_trans_qh.allocate(backend_ctx->context, size_qh); + buf_trans_d.allocate(backend_ctx->context, size_d); + buf_trans_m.allocate(backend_ctx->context, size_m); + buf_unpacked.allocate(backend_ctx->context, ggml_nbytes(tensor)); + + // Transpose back: from col-major to row-major + transpose_2d_as_16b(backend_ctx, extra->qs, buf_trans_qs.buffer, size_qs, M, K/4); + transpose_2d_as_8b(backend_ctx, extra->qh, buf_trans_qh.buffer, size_qh, M, K/8); + transpose_2d_as_16b(backend_ctx, extra->d, buf_trans_d.buffer, size_d, M, K/32); + transpose_2d_as_16b(backend_ctx, extra->m, buf_trans_m.buffer, size_m, M, K/32); + + cl_uchar mask_0F = 0x0F; + cl_uchar mask_F0 = 0xF0; + + size_t global_work_size[] = {(size_t)ggml_nelements(tensor)/ggml_blck_size(tensor->type), 1, 1}; + size_t local_work_size[] = {1, 1, 1}; + + cl_kernel kernel = backend_ctx->kernel_restore_block_q5_1_noshuffle; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf_trans_qs.buffer)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &buf_trans_qh.buffer)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &buf_trans_d.buffer)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &buf_trans_m.buffer)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &buf_unpacked.buffer)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_uchar), &mask_0F)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_uchar), &mask_F0)); + + CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL)); + CL_CHECK(clEnqueueReadBuffer(queue, buf_unpacked.buffer, CL_TRUE, offset, size, data, 0, NULL, NULL)); + return; + } #endif // GGML_OPENCL_USE_ADRENO_KERNELS cl_int err; cl_mem data_device = clCreateBuffer(context, CL_MEM_READ_WRITE, @@ -9899,14 +10153,8 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const float eps; memcpy(&eps, dst->op_params, sizeof(float)); - const int ne00 = src0 ? src0->ne[0] : 0; - const int ne01 = src0 ? src0->ne[1] : 0; - const int ne02 = src0 ? src0->ne[2] : 0; - const int ne03 = src0 ? src0->ne[3] : 0; - - const cl_ulong nb01 = src0 ? src0->nb[1] : 0; - const cl_ulong nb02 = src0 ? src0->nb[2] : 0; - const cl_ulong nb03 = src0 ? src0->nb[3] : 0; + GGML_TENSOR_LOCALS(int, ne0, src0, ne); + GGML_TENSOR_LOCALS(cl_ulong, nb0, src0, nb); const int nth = MIN(64, ne00); @@ -9920,11 +10168,12 @@ static void ggml_cl_norm(ggml_backend_t backend, const ggml_tensor * src0, const CL_CHECK(clSetKernelArg(kernel, 5, sizeof(int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne02)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(int), &ne03)); - CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb01)); - CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb02)); - CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb03)); - CL_CHECK(clSetKernelArg(kernel, 11, sizeof(float), &eps)); - CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float)*nth, NULL)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb01)); + CL_CHECK(clSetKernelArg(kernel, 10, sizeof(cl_ulong), &nb02)); + CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb03)); + CL_CHECK(clSetKernelArg(kernel, 12, sizeof(float), &eps)); + CL_CHECK(clSetKernelArg(kernel, 13, sizeof(float)*nth, NULL)); size_t global_work_size[] = {(size_t)ne01*nth, (size_t)ne02, (size_t)ne03}; size_t local_work_size[] = {(size_t)nth, 1, 1}; @@ -12205,7 +12454,7 @@ static void ggml_cl_mul_mat_q4_1_f32_adreno(ggml_backend_t backend, const ggml_t #endif } -static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_q5_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -12218,17 +12467,17 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - ggml_tensor_extra_cl_iq4_nl * extra0_iq4_nl = (ggml_tensor_extra_cl_iq4_nl *)src0->extra; + ggml_tensor_extra_cl_q5_0 * extra0_q5_0 = (ggml_tensor_extra_cl_q5_0 *)src0->extra; cl_ulong offset1 = extra1->offset + src1->view_offs; cl_ulong offsetd = extrad->offset + dst->view_offs; - const int ne00 = src0->ne[0]; - const int ne01 = src0->ne[1]; + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; - const int ne1 = dst->ne[1]; + const int ne1 = dst->ne[1]; - GGML_ASSERT(ne00 % 32 == 0); + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); cl_context context = backend_ctx->context; cl_kernel kernel; @@ -12243,17 +12492,17 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml int K = ne00; if (ne1 == 1) { - cl_mem q_img = nullptr; + cl_mem qs_img = nullptr; cl_mem b_sub_buf = nullptr; cl_mem b_img = nullptr; - // image for q - img_fmt = { CL_R, CL_UNSIGNED_INT32}; + // image for qs + img_fmt = { CL_R, CL_UNSIGNED_INT32 }; memset(&img_desc, 0, sizeof(img_desc)); img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; img_desc.image_width = M * K / 2 / 4; - img_desc.buffer = extra0_iq4_nl->q; - CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); + img_desc.buffer = extra0_q5_0->qs; + CL_CHECK((qs_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); // subbuffer for activations region.origin = offset1; @@ -12268,22 +12517,23 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml img_desc.buffer = b_sub_buf; CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); - kernel = backend_ctx->kernel_gemv_noshuffle_iq4_nl_f32; + kernel = backend_ctx->kernel_gemv_noshuffle_q5_0_f32; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_img)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_iq4_nl->d)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offsetd)); - CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne00)); - CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &qs_img)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_0->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_0->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &b_img)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne01)); size_t local_work_size[3] = {64, 4, 1}; size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); - CL_CHECK(clReleaseMemObject(q_img)); + CL_CHECK(clReleaseMemObject(qs_img)); CL_CHECK(clReleaseMemObject(b_sub_buf)); CL_CHECK(clReleaseMemObject(b_img)); } else { @@ -12291,6 +12541,7 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml cl_mem b_sub_buf_trans = nullptr; cl_mem b_img = nullptr; cl_mem b_img_trans = nullptr; + cl_mem d_sub_buf = nullptr; // subbuffer for activations region.origin = offset1; @@ -12326,6 +12577,11 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml img_desc.buffer = b_sub_buf_trans; CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err)); + // subbuffer for output + region.origin = extrad->offset; + region.size = M * N * sizeof(float); + CL_CHECK((d_sub_buf = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + // transpose activations int height_B = N/4; if (height_B == 0) { @@ -12346,14 +12602,14 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); // gemm - kernel = backend_ctx->kernel_gemm_noshuffle_iq4_nl_f32; + kernel = backend_ctx->kernel_gemm_noshuffle_q5_0_f32; int padded_N = N + padding; - CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_iq4_nl->q)); - CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_iq4_nl->d)); - CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img_trans)); - CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device)); - CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_0->qs)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_0->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_0->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &b_img_trans)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_sub_buf)); CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne01)); CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &padded_N)); CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne00)); @@ -12368,6 +12624,7 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); CL_CHECK(clReleaseMemObject(b_img)); CL_CHECK(clReleaseMemObject(b_img_trans)); + CL_CHECK(clReleaseMemObject(d_sub_buf)); } #else GGML_UNUSED(backend); @@ -12377,7 +12634,7 @@ static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml #endif } -static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cl_mul_mat_q5_1_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_OPENCL_USE_ADRENO_KERNELS GGML_ASSERT(src0); GGML_ASSERT(src0->extra); @@ -12386,24 +12643,378 @@ static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_t GGML_ASSERT(dst); GGML_ASSERT(dst->extra); - GGML_ASSERT(src0->type == GGML_TYPE_Q8_0); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; - ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra; + ggml_tensor_extra_cl_q5_1 * extra0_q5_1 = (ggml_tensor_extra_cl_q5_1 *)src0->extra; cl_ulong offset1 = extra1->offset + src1->view_offs; cl_ulong offsetd = extrad->offset + dst->view_offs; - GGML_ASSERT(src1->view_offs == 0); - GGML_ASSERT(dst->view_offs == 0); + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; - const int ne00 = src0->ne[0]; - const int ne01 = src0->ne[1]; - const int ne02 = src0->ne[2]; + const int ne1 = dst->ne[1]; + + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + + cl_context context = backend_ctx->context; + cl_kernel kernel; + + cl_int err; + cl_image_format img_fmt; + cl_image_desc img_desc; + cl_buffer_region region; + + int M = ne01; + int N = ne1; + int K = ne00; + + if (ne1 == 1) { + cl_mem qs_img = nullptr; + cl_mem b_sub_buf = nullptr; + cl_mem b_img = nullptr; + + // image for qs + img_fmt = { CL_R, CL_UNSIGNED_INT32 }; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = M * K / 2 / 4; + img_desc.buffer = extra0_q5_1->qs; + CL_CHECK((qs_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); + + // subbuffer for activations + region.origin = offset1; + region.size = K * N * sizeof(float); + CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + // image for activations + img_fmt = {CL_RGBA, CL_FLOAT}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = K * N / 4; + img_desc.buffer = b_sub_buf; + CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); + + kernel = backend_ctx->kernel_gemv_noshuffle_q5_1_f32; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &qs_img)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_1->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_1->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_1->m)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_img)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne01)); + + size_t local_work_size[3] = {64, 4, 1}; + size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + + CL_CHECK(clReleaseMemObject(qs_img)); + CL_CHECK(clReleaseMemObject(b_sub_buf)); + CL_CHECK(clReleaseMemObject(b_img)); + } else { + cl_mem b_sub_buf = nullptr; + cl_mem b_sub_buf_trans = nullptr; + cl_mem b_img = nullptr; + cl_mem b_img_trans = nullptr; + cl_mem d_sub_buf = nullptr; + + // subbuffer for activations + region.origin = offset1; + region.size = K * N * sizeof(float); + CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + // image for activations + img_fmt = {CL_RGBA, CL_FLOAT}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = K * N / 4; + img_desc.buffer = b_sub_buf; + CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); + + // pad N to multiple of 8 + int extra_elements = N % 8; + int padding = 0; + if (extra_elements > 0){ + padding = 8 - extra_elements; + } + + // subbuffer for transposed activations + region.origin = 0; + region.size = K * (N + padding) * sizeof(float)/2; + backend_ctx->prealloc_act_trans.allocate(context, region.size); + CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + // image for transposed activations + img_fmt = {CL_RGBA, CL_HALF_FLOAT}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = K * (N + padding) / 4; + img_desc.buffer = b_sub_buf_trans; + CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err)); + + // subbuffer for output + region.origin = extrad->offset; + region.size = M * N * sizeof(float); + CL_CHECK((d_sub_buf = clCreateSubBuffer(extrad->data_device, CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + // transpose activations + int height_B = N/4; + if (height_B == 0) { + height_B = 1; + } + int width_B = K/4; + int padded_height_B = (N + padding)/4; + + kernel = backend_ctx->kernel_transpose_32_16; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B)); + + size_t local_work_size_t[2] = { 1, 16 }; + size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B }; + backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + + // gemm + kernel = backend_ctx->kernel_gemm_noshuffle_q5_1_f32; + int padded_N = N + padding; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_q5_1->qs)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_q5_1->qh)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra0_q5_1->d)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extra0_q5_1->m)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &b_img_trans)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_sub_buf)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &padded_N)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_int), &ne1)); + + size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1}; + size_t local_work_size[3] = {1, 128, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + + CL_CHECK(clReleaseMemObject(b_sub_buf)); + CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); + CL_CHECK(clReleaseMemObject(b_img)); + CL_CHECK(clReleaseMemObject(b_img_trans)); + CL_CHECK(clReleaseMemObject(d_sub_buf)); + } +#else + GGML_UNUSED(backend); + GGML_UNUSED(src0); + GGML_UNUSED(src1); + GGML_UNUSED(dst); +#endif +} + +static void ggml_cl_mul_mat_iq4_nl_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + ggml_tensor_extra_cl_iq4_nl * extra0_iq4_nl = (ggml_tensor_extra_cl_iq4_nl *)src0->extra; + + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + + const int ne1 = dst->ne[1]; + + GGML_ASSERT(ne00 % 32 == 0); + + cl_context context = backend_ctx->context; + cl_kernel kernel; + + cl_int err; + cl_image_format img_fmt; + cl_image_desc img_desc; + cl_buffer_region region; + + int M = ne01; + int N = ne1; + int K = ne00; + + if (ne1 == 1) { + cl_mem q_img = nullptr; + cl_mem b_sub_buf = nullptr; + cl_mem b_img = nullptr; + + // image for q + img_fmt = { CL_R, CL_UNSIGNED_INT32}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = M * K / 2 / 4; + img_desc.buffer = extra0_iq4_nl->q; + CL_CHECK((q_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); + + // subbuffer for activations + region.origin = offset1; + region.size = K * N * sizeof(float); + CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + // image for activations + img_fmt = {CL_RGBA, CL_FLOAT}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = K * N / 4; + img_desc.buffer = b_sub_buf; + CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); + + kernel = backend_ctx->kernel_gemv_noshuffle_iq4_nl_f32; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_img)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_iq4_nl->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &ne01)); + + size_t local_work_size[3] = {64, 4, 1}; + size_t global_work_size[3] = {(size_t)CEIL_DIV(ne01/2, 64)*64, 4, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + + CL_CHECK(clReleaseMemObject(q_img)); + CL_CHECK(clReleaseMemObject(b_sub_buf)); + CL_CHECK(clReleaseMemObject(b_img)); + } else { + cl_mem b_sub_buf = nullptr; + cl_mem b_sub_buf_trans = nullptr; + cl_mem b_img = nullptr; + cl_mem b_img_trans = nullptr; + + // subbuffer for activations + region.origin = offset1; + region.size = K * N * sizeof(float); + CL_CHECK((b_sub_buf = clCreateSubBuffer(extra1->data_device, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + // image for activations + img_fmt = {CL_RGBA, CL_FLOAT}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = K * N / 4; + img_desc.buffer = b_sub_buf; + CL_CHECK((b_img = clCreateImage(context, CL_MEM_READ_ONLY, &img_fmt, &img_desc, NULL, &err), err)); + + // pad N to multiple of 8 + int extra_elements = N % 8; + int padding = 0; + if (extra_elements > 0){ + padding = 8 - extra_elements; + } + + // subbuffer for transposed activations + region.origin = 0; + region.size = K * (N + padding) * sizeof(float)/2; + backend_ctx->prealloc_act_trans.allocate(context, region.size); + CL_CHECK((b_sub_buf_trans = clCreateSubBuffer(backend_ctx->prealloc_act_trans.buffer, 0, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err), err)); + + // image for transposed activations + img_fmt = {CL_RGBA, CL_HALF_FLOAT}; + memset(&img_desc, 0, sizeof(img_desc)); + img_desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + img_desc.image_width = K * (N + padding) / 4; + img_desc.buffer = b_sub_buf_trans; + CL_CHECK((b_img_trans = clCreateImage(context, 0, &img_fmt, &img_desc, NULL, &err), err)); + + // transpose activations + int height_B = N/4; + if (height_B == 0) { + height_B = 1; + } + int width_B = K/4; + int padded_height_B = (N + padding)/4; + + kernel = backend_ctx->kernel_transpose_32_16; + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &b_img)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &b_img_trans)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), &height_B)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(int), &width_B)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(int), &padded_height_B)); + + size_t local_work_size_t[2] = { 1, 16 }; + size_t global_work_size_t[2] = { (size_t)width_B, (size_t)padded_height_B }; + backend_ctx->enqueue_ndrange_kernel(kernel, 2, global_work_size_t, local_work_size_t, dst); + + // gemm + kernel = backend_ctx->kernel_gemm_noshuffle_iq4_nl_f32; + int padded_N = N + padding; + + CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0_iq4_nl->q)); + CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), &extra0_iq4_nl->d)); + CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &b_img_trans)); + CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_mem), &extrad->data_device)); + CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_ulong), &offsetd)); + CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_int), &ne01)); + CL_CHECK(clSetKernelArg(kernel, 6, sizeof(cl_int), &padded_N)); + CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_int), &ne00)); + CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_int), &ne1)); + + size_t global_work_size[3] = {(size_t)CEIL_DIV(ne1, 8), (size_t)CEIL_DIV(ne01, 4), 1}; + size_t local_work_size[3] = {1, 128, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + + CL_CHECK(clReleaseMemObject(b_sub_buf)); + CL_CHECK(clReleaseMemObject(b_sub_buf_trans)); + CL_CHECK(clReleaseMemObject(b_img)); + CL_CHECK(clReleaseMemObject(b_img_trans)); + } +#else + GGML_UNUSED(backend); + GGML_UNUSED(src0); + GGML_UNUSED(src1); + GGML_UNUSED(dst); +#endif +} + +static void ggml_cl_mul_mat_q8_0_f32_adreno(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +#ifdef GGML_OPENCL_USE_ADRENO_KERNELS + GGML_ASSERT(src0); + GGML_ASSERT(src0->extra); + GGML_ASSERT(src1); + GGML_ASSERT(src1->extra); + GGML_ASSERT(dst); + GGML_ASSERT(dst->extra); + + GGML_ASSERT(src0->type == GGML_TYPE_Q8_0); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context; + + ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra; + ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra; + ggml_tensor_extra_cl_q8_0 * extra0_q8_0 = (ggml_tensor_extra_cl_q8_0 *)src0->extra; + + cl_ulong offset1 = extra1->offset + src1->view_offs; + cl_ulong offsetd = extrad->offset + dst->view_offs; + + GGML_ASSERT(src1->view_offs == 0); + GGML_ASSERT(dst->view_offs == 0); + + const int ne00 = src0->ne[0]; + const int ne01 = src0->ne[1]; + const int ne02 = src0->ne[2]; const int ne10 = src1->ne[0]; const int ne12 = src1->ne[2]; @@ -13243,6 +13854,18 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co return; } + // q5_0 x fp32 + if (src0t == GGML_TYPE_Q5_0 && src1t == GGML_TYPE_F32) { + ggml_cl_mul_mat_q5_0_f32_adreno(backend, src0, src1, dst); + return; + } + + // q5_1 x fp32 + if (src0t == GGML_TYPE_Q5_1 && src1t == GGML_TYPE_F32) { + ggml_cl_mul_mat_q5_1_f32_adreno(backend, src0, src1, dst); + return; + } + // iq4_nl x fp32 if (src0t == GGML_TYPE_IQ4_NL && src1t == GGML_TYPE_F32) { ggml_cl_mul_mat_iq4_nl_f32_adreno(backend, src0, src1, dst); @@ -13951,11 +14574,31 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co } if (src1t == GGML_TYPE_F32) { + // heuristic for packing more work for Adreno + const bool adreno_use_lane_split = + backend_ctx->gpu_family == ADRENO && + ne11 == 1 && + ne01 >= 8 && + ne00 % 4 == 0 && + r3 == 1 && r2 >= 1 && r2 <= 8 && + (ne12 % r2) == 0; + if (ne11 * ne12 < 4) { kernel = backend_ctx->kernel_mul_mat_f16_f32_1row; + } else if (adreno_use_lane_split && ne00 >= 64 && ne00 <= 128) { + kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq; + nrows = 1; + } else if (adreno_use_lane_split && r2 >= 2 && ne00 > 128 && ne00 <= 256) { + kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls; + nrows = 1; } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) { - kernel = backend_ctx->kernel_mul_mat_f16_f32_l4; - nrows = ne11; + if (ne11 == 1) { + kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr; + nrows = 1; // not used by this kernel + } else { + kernel = backend_ctx->kernel_mul_mat_f16_f32_l4; + nrows = ne11; + } } else { kernel = backend_ctx->kernel_mul_mat_f16_f32; nrows = 4; @@ -14734,12 +15377,30 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); } else { - int64_t ny = (ne11 + nrows - 1)/nrows; + if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr) { + const int NDST_DR = 4; + size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, NDST_DR)*nth0, (size_t)nth1, (size_t)ne12*ne13}; + size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; - size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13}; - size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + } else if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls) { + size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, 2)*nth0, (size_t)nth1, (size_t)ne02*ne03}; + size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; - backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + } else if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq) { + size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, 4)*nth0, (size_t)nth1, (size_t)ne02*ne03}; + size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + } else { + int64_t ny = (ne11 + nrows - 1)/nrows; + + size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13}; + size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1}; + + backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst); + } } } diff --git a/ggml/src/ggml-opencl/kernels/cvt.cl b/ggml/src/ggml-opencl/kernels/cvt.cl index d07f0a1a025c..226b127ab3be 100644 --- a/ggml/src/ggml-opencl/kernels/cvt.cl +++ b/ggml/src/ggml-opencl/kernels/cvt.cl @@ -584,6 +584,60 @@ kernel void kernel_restore_block_q5_0( } } +kernel void kernel_convert_block_q5_0_noshuffle( + global struct block_q5_0 * src0, + global uchar * dst_q, + global uint * dst_qh, + global half * dst_d +) { + global struct block_q5_0 * b = (global struct block_q5_0 *) src0 + get_global_id(0); + global uchar * q = (global uchar *) dst_q + QK5_0/2*get_global_id(0); + global uint * qh = (global uint *) dst_qh + get_global_id(0); + global half * d = (global half *) dst_d + get_global_id(0); + + *d = b->d; + *qh = *((global uint *)(b->qh)); + + for (int i = 0; i < QK5_0/4; ++i) { + uchar x0 = b->qs[2*i + 0]; + uchar x1 = b->qs[2*i + 1]; + + q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); + q[i + QK5_0/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); + +#ifdef ADRENO_GPU + if (get_global_id(0) == 65536*4096) { + printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0)); + } +#endif + } +} + +kernel void kernel_restore_block_q5_0_noshuffle( + global uchar * src_q, + global uint * src_qh, + global half * src_d, + global struct block_q5_0 * dst, + uchar mask_0F, + uchar mask_F0 +) { + global struct block_q5_0 * b = (global struct block_q5_0 *) dst + get_global_id(0); + global uchar * q = (global uchar *) src_q + QK5_0/2*get_global_id(0); + global uint * qh = (global uint *) src_qh + get_global_id(0); + global half * d = (global half *) src_d + get_global_id(0); + + b->d = *d; + *((global uint *)(b->qh)) = *qh; + + for (int i = 0; i < QK5_0/4; ++i) { + uchar x0 = q[i + 0 ]; + uchar x1 = q[i + QK5_0/4]; + + b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4)); + b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0)); + } +} + kernel void kernel_convert_block_q5_0_trans4_ns( __global struct block_q5_0 * src0, __global uint * dst_qs, @@ -736,6 +790,66 @@ kernel void kernel_restore_block_q5_1( } } +kernel void kernel_convert_block_q5_1_noshuffle( + global struct block_q5_1 * src0, + global uchar * dst_q, + global uint * dst_qh, + global half * dst_d, + global half * dst_m +) { + global struct block_q5_1 * b = (global struct block_q5_1 *) src0 + get_global_id(0); + global uchar * q = (global uchar *) dst_q + QK5_1/2*get_global_id(0); + global uint * qh = (global uint *) dst_qh + get_global_id(0); + global half * d = (global half *) dst_d + get_global_id(0); + global half * m = (global half *) dst_m + get_global_id(0); + + *d = b->d; + *m = b->m; + *qh = *((global uint *)(b->qh)); + + for (int i = 0; i < QK5_1/4; ++i) { + uchar x0 = b->qs[2*i + 0]; + uchar x1 = b->qs[2*i + 1]; + + q[i + 0 ] = convert_uchar(x0 & 0x0F) | convert_uchar((x1 & 0x0F) << 4); + q[i + QK5_1/4] = convert_uchar((x0 & 0xF0) >> 4) | convert_uchar(x1 & 0xF0); + +#ifdef ADRENO_GPU + if (get_global_id(0) == 65536*4096) { + printf("%04x - %02x\n", *(global ushort*)d, ((x0 & 0xF0) >> 4) | (x1 & 0xF0)); + } +#endif + } +} + +kernel void kernel_restore_block_q5_1_noshuffle( + global uchar * src_q, + global uint * src_qh, + global half * src_d, + global half * src_m, + global struct block_q5_1 * dst, + uchar mask_0F, + uchar mask_F0 +) { + global struct block_q5_1 * b = (global struct block_q5_1 *) dst + get_global_id(0); + global uchar * q = (global uchar *) src_q + QK5_1/2*get_global_id(0); + global uint * qh = (global uint *) src_qh + get_global_id(0); + global half * d = (global half *) src_d + get_global_id(0); + global half * m = (global half *) src_m + get_global_id(0); + + b->d = *d; + b->m = *m; + *((global uint *)(b->qh)) = *qh; + + for (int i = 0; i < QK5_1/4; ++i) { + uchar x0 = q[i + 0 ]; + uchar x1 = q[i + QK5_1/4]; + + b->qs[2*i + 0] = convert_uchar((x0 & mask_0F) | ((x1 & mask_0F) << 4)); + b->qs[2*i + 1] = convert_uchar(((x0 & mask_F0) >> 4) | (x1 & mask_F0)); + } +} + kernel void kernel_convert_block_q5_1_trans4_ns( __global struct block_q5_1 * src0, __global uint * dst_qs, diff --git a/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl b/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl new file mode 100644 index 000000000000..1d6bd48005e3 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_0_f32.cl @@ -0,0 +1,131 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#ifdef cl_qcom_reqd_sub_group_size +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +#ifdef ADRENO_GPU +REQD_SUBGROUP_SIZE_128 +#endif + +kernel void kernel_gemm_noshuffle_q5_0_f32( + global const ushort * src0_qs, // quantized A + global const uchar * src0_qh, // 5th bits + global const half * src0_d, // A scales + __read_only image1d_buffer_t src1, // B (1d image) + global float * dst, // C + int m, // M + int n, // N with padding + int k, // K + int n_no_padding // N without padding +) { + + int n_4 = n >> 2; + + int gy = get_global_id(0); + int gx = get_global_id(1); + int gx_2 = gx << 2; + + half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; + half8 B; + half4 dequantized_weights; + + global const ushort * weight_ptr = src0_qs + gx_2; + global const uchar * qh_ptr = src0_qh + gx_2; + global const half * scale_ptr = src0_d + gx_2; + + for (int i = 0; i < k; i += 4) { + + B.s0123 = read_imageh(src1, gy*2 + i*n_4); + B.s4567 = read_imageh(src1, gy*2 + i*n_4 + 1); + + ushort4 bits4 = vload4(0, weight_ptr + (i >> 2)*m); + uchar4 bits1 = vload4(0, qh_ptr + (i >> 3)*m); + uchar4 qh = bits1 >> (uchar4)(i & 4); + + half4 scale = vload4(0, scale_ptr + (i >> 5)*m); + + // j=0 + dequantized_weights.s0 = (convert_half((bits4.s0 & 0x000F) | ((qh.s0 & 0x01) << 4)) - 16.0h) * scale.s0; + dequantized_weights.s1 = (convert_half((bits4.s1 & 0x000F) | ((qh.s1 & 0x01) << 4)) - 16.0h) * scale.s1; + dequantized_weights.s2 = (convert_half((bits4.s2 & 0x000F) | ((qh.s2 & 0x01) << 4)) - 16.0h) * scale.s2; + dequantized_weights.s3 = (convert_half((bits4.s3 & 0x000F) | ((qh.s3 & 0x01) << 4)) - 16.0h) * scale.s3; + c0 += B * dequantized_weights.s0; + c1 += B * dequantized_weights.s1; + c2 += B * dequantized_weights.s2; + c3 += B * dequantized_weights.s3; + + // j=1 + B.s0123 = read_imageh(src1, gy*2 + (i+1)*n_4); + B.s4567 = read_imageh(src1, gy*2 + (i+1)*n_4 + 1); + dequantized_weights.s0 = (convert_half(((bits4.s0 & 0x00F0) >> 4) | ((qh.s0 & 0x02) << 3)) - 16.0h) * scale.s0; + dequantized_weights.s1 = (convert_half(((bits4.s1 & 0x00F0) >> 4) | ((qh.s1 & 0x02) << 3)) - 16.0h) * scale.s1; + dequantized_weights.s2 = (convert_half(((bits4.s2 & 0x00F0) >> 4) | ((qh.s2 & 0x02) << 3)) - 16.0h) * scale.s2; + dequantized_weights.s3 = (convert_half(((bits4.s3 & 0x00F0) >> 4) | ((qh.s3 & 0x02) << 3)) - 16.0h) * scale.s3; + c0 += B * dequantized_weights.s0; + c1 += B * dequantized_weights.s1; + c2 += B * dequantized_weights.s2; + c3 += B * dequantized_weights.s3; + + // j=2 + B.s0123 = read_imageh(src1, gy*2 + (i+2)*n_4); + B.s4567 = read_imageh(src1, gy*2 + (i+2)*n_4 + 1); + dequantized_weights.s0 = (convert_half(((bits4.s0 & 0x0F00) >> 8) | ((qh.s0 & 0x04) << 2)) - 16.0h) * scale.s0; + dequantized_weights.s1 = (convert_half(((bits4.s1 & 0x0F00) >> 8) | ((qh.s1 & 0x04) << 2)) - 16.0h) * scale.s1; + dequantized_weights.s2 = (convert_half(((bits4.s2 & 0x0F00) >> 8) | ((qh.s2 & 0x04) << 2)) - 16.0h) * scale.s2; + dequantized_weights.s3 = (convert_half(((bits4.s3 & 0x0F00) >> 8) | ((qh.s3 & 0x04) << 2)) - 16.0h) * scale.s3; + c0 += B * dequantized_weights.s0; + c1 += B * dequantized_weights.s1; + c2 += B * dequantized_weights.s2; + c3 += B * dequantized_weights.s3; + + // j=3 + B.s0123 = read_imageh(src1, gy*2 + (i+3)*n_4); + B.s4567 = read_imageh(src1, gy*2 + (i+3)*n_4 + 1); + dequantized_weights.s0 = (convert_half(((bits4.s0 & 0xF000) >> 12) | ((qh.s0 & 0x08) << 1)) - 16.0h) * scale.s0; + dequantized_weights.s1 = (convert_half(((bits4.s1 & 0xF000) >> 12) | ((qh.s1 & 0x08) << 1)) - 16.0h) * scale.s1; + dequantized_weights.s2 = (convert_half(((bits4.s2 & 0xF000) >> 12) | ((qh.s2 & 0x08) << 1)) - 16.0h) * scale.s2; + dequantized_weights.s3 = (convert_half(((bits4.s3 & 0xF000) >> 12) | ((qh.s3 & 0x08) << 1)) - 16.0h) * scale.s3; + c0 += B * dequantized_weights.s0; + c1 += B * dequantized_weights.s1; + c2 += B * dequantized_weights.s2; + c3 += B * dequantized_weights.s3; + } + + int idx = (gy<<3)*m + (gx<<2); + + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx); + } +} diff --git a/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl b/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl new file mode 100644 index 000000000000..94b4ef6caccd --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemm_noshuffle_q5_1_f32.cl @@ -0,0 +1,134 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable + +#ifdef cl_qcom_reqd_sub_group_size +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full"))) +#endif + +#ifdef ADRENO_GPU +REQD_SUBGROUP_SIZE_128 +#endif + +kernel void kernel_gemm_noshuffle_q5_1_f32( + global const ushort * src0_qs, // quantized A + global const uchar * src0_qh, // 5th bits + global const half * src0_d, // A scales + global const half * src0_m, // A mins + __read_only image1d_buffer_t src1, // B (1d image) + global float * dst, // C + int m, // M + int n, // N with padding + int k, // K + int n_no_padding // N without padding +) { + + int n_4 = n >> 2; + + int gy = get_global_id(0); + int gx = get_global_id(1); + int gx_2 = gx << 2; + + half8 c0 = 0, c1 = 0, c2 = 0, c3 = 0; + half8 B; + half4 dequantized_weights; + + global const ushort * weight_ptr = src0_qs + gx_2; + global const uchar * qh_ptr = src0_qh + gx_2; + global const half * scale_ptr = src0_d + gx_2; + global const half * min_ptr = src0_m + gx_2; + + for (int i = 0; i < k; i += 4) { + + B.s0123 = read_imageh(src1, gy*2 + i*n_4); + B.s4567 = read_imageh(src1, gy*2 + i*n_4 + 1); + + ushort4 bits4 = vload4(0, weight_ptr + (i >> 2)*m); + uchar4 bits1 = vload4(0, qh_ptr + (i >> 3)*m); + uchar4 qh = bits1 >> (uchar4)(i & 4); + + half4 scale = vload4(0, scale_ptr + (i >> 5)*m); + half4 minv = vload4(0, min_ptr + (i >> 5)*m); + + // j=0 + dequantized_weights.s0 = convert_half((bits4.s0 & 0x000F) | ((qh.s0 & 0x01) << 4)) * scale.s0 + minv.s0; + dequantized_weights.s1 = convert_half((bits4.s1 & 0x000F) | ((qh.s1 & 0x01) << 4)) * scale.s1 + minv.s1; + dequantized_weights.s2 = convert_half((bits4.s2 & 0x000F) | ((qh.s2 & 0x01) << 4)) * scale.s2 + minv.s2; + dequantized_weights.s3 = convert_half((bits4.s3 & 0x000F) | ((qh.s3 & 0x01) << 4)) * scale.s3 + minv.s3; + c0 += B * dequantized_weights.s0; + c1 += B * dequantized_weights.s1; + c2 += B * dequantized_weights.s2; + c3 += B * dequantized_weights.s3; + + // j=1 + B.s0123 = read_imageh(src1, gy*2 + (i+1)*n_4); + B.s4567 = read_imageh(src1, gy*2 + (i+1)*n_4 + 1); + dequantized_weights.s0 = convert_half(((bits4.s0 & 0x00F0) >> 4) | ((qh.s0 & 0x02) << 3)) * scale.s0 + minv.s0; + dequantized_weights.s1 = convert_half(((bits4.s1 & 0x00F0) >> 4) | ((qh.s1 & 0x02) << 3)) * scale.s1 + minv.s1; + dequantized_weights.s2 = convert_half(((bits4.s2 & 0x00F0) >> 4) | ((qh.s2 & 0x02) << 3)) * scale.s2 + minv.s2; + dequantized_weights.s3 = convert_half(((bits4.s3 & 0x00F0) >> 4) | ((qh.s3 & 0x02) << 3)) * scale.s3 + minv.s3; + c0 += B * dequantized_weights.s0; + c1 += B * dequantized_weights.s1; + c2 += B * dequantized_weights.s2; + c3 += B * dequantized_weights.s3; + + // j=2 + B.s0123 = read_imageh(src1, gy*2 + (i+2)*n_4); + B.s4567 = read_imageh(src1, gy*2 + (i+2)*n_4 + 1); + dequantized_weights.s0 = convert_half(((bits4.s0 & 0x0F00) >> 8) | ((qh.s0 & 0x04) << 2)) * scale.s0 + minv.s0; + dequantized_weights.s1 = convert_half(((bits4.s1 & 0x0F00) >> 8) | ((qh.s1 & 0x04) << 2)) * scale.s1 + minv.s1; + dequantized_weights.s2 = convert_half(((bits4.s2 & 0x0F00) >> 8) | ((qh.s2 & 0x04) << 2)) * scale.s2 + minv.s2; + dequantized_weights.s3 = convert_half(((bits4.s3 & 0x0F00) >> 8) | ((qh.s3 & 0x04) << 2)) * scale.s3 + minv.s3; + c0 += B * dequantized_weights.s0; + c1 += B * dequantized_weights.s1; + c2 += B * dequantized_weights.s2; + c3 += B * dequantized_weights.s3; + + // j=3 + B.s0123 = read_imageh(src1, gy*2 + (i+3)*n_4); + B.s4567 = read_imageh(src1, gy*2 + (i+3)*n_4 + 1); + dequantized_weights.s0 = convert_half(((bits4.s0 & 0xF000) >> 12) | ((qh.s0 & 0x08) << 1)) * scale.s0 + minv.s0; + dequantized_weights.s1 = convert_half(((bits4.s1 & 0xF000) >> 12) | ((qh.s1 & 0x08) << 1)) * scale.s1 + minv.s1; + dequantized_weights.s2 = convert_half(((bits4.s2 & 0xF000) >> 12) | ((qh.s2 & 0x08) << 1)) * scale.s2 + minv.s2; + dequantized_weights.s3 = convert_half(((bits4.s3 & 0xF000) >> 12) | ((qh.s3 & 0x08) << 1)) * scale.s3 + minv.s3; + c0 += B * dequantized_weights.s0; + c1 += B * dequantized_weights.s1; + c2 += B * dequantized_weights.s2; + c3 += B * dequantized_weights.s3; + } + + int idx = (gy<<3)*m + (gx<<2); + + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s0, c1.s0, c2.s0, c3.s0), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s1, c1.s1, c2.s1, c3.s1), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s2, c1.s2, c2.s2, c3.s2), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s3, c1.s3, c2.s3, c3.s3), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s4, c1.s4, c2.s4, c3.s4), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s5, c1.s5, c2.s5, c3.s5), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s6, c1.s6, c2.s6, c3.s6), 0, dst + idx); + idx += m; + } + if(idx+3 < m*n_no_padding){ + vstore4((float4)(c0.s7, c1.s7, c2.s7, c3.s7), 0, dst + idx); + } +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl new file mode 100644 index 000000000000..c228f717a94b --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_0_f32.cl @@ -0,0 +1,291 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable + +#ifdef cl_qcom_reqd_sub_group_size +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#endif + +#define QK5_0 32 +#define NSUBGROUPS 4 +#define SUBGROUP_SIZE 64 + +#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_hi(total_sums, bits4, bits1, scale, y) \ + float shared_y; \ + shared_y = sub_group_broadcast(y.s0, 0); \ + total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s0 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s4 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 0); \ + total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 0); \ + total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 0); \ + total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s4, 0); \ + total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 0); \ + total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 0); \ + total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 0); \ + total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s0, 1); \ + total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s1 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s5 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 1); \ + total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 1); \ + total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 1); \ + total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s4, 1); \ + total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 1); \ + total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 1); \ + total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 1); \ + total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + + +#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_lo(total_sums, bits4, bits1, scale, y) \ + shared_y = sub_group_broadcast(y.s0, 2); \ + total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s2 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s6 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 2); \ + total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 2); \ + total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 2); \ + total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s4, 2); \ + total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 2); \ + total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 2); \ + total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 2); \ + total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s0, 3); \ + total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s3 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s7 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 3); \ + total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 3); \ + total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 3); \ + total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s4, 3); \ + total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 3); \ + total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 3); \ + total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 3); \ + total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y; \ + total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y; \ + + +#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_hi(total_sums, bits4, bits1, scale, y) \ + float8 shared_y; \ + shared_y = sub_group_broadcast(y, 0); \ + total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s0 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \ + total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \ + total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \ + total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \ + total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \ + total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \ + total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \ + total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \ + total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s4 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \ + total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \ + total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \ + total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \ + total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \ + total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \ + total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \ + total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \ + shared_y = sub_group_broadcast(y, 1); \ + total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s1 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \ + total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \ + total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \ + total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \ + total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \ + total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \ + total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \ + total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \ + total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s5 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \ + total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \ + total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \ + total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \ + total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \ + total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \ + total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \ + total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \ + + +#define dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_lo(total_sums, bits4, bits1, scale, y) \ + shared_y = sub_group_broadcast(y, 2); \ + total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s2 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \ + total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \ + total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \ + total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \ + total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \ + total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \ + total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \ + total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \ + total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s6 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \ + total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \ + total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \ + total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \ + total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \ + total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \ + total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \ + total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \ + shared_y = sub_group_broadcast(y, 3); \ + total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s3 ) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s0; \ + total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s1; \ + total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s2; \ + total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s3; \ + total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s4; \ + total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s5; \ + total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s6; \ + total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) - 16) * scale.s0 * shared_y.s7; \ + total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s7 ) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s0; \ + total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s1; \ + total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s2; \ + total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s3; \ + total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s4; \ + total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s5; \ + total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s6; \ + total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) - 16) * scale.s1 * shared_y.s7; \ + +#ifdef ADRENO_GPU +REQD_SUBGROUP_SIZE_64 +#endif +__kernel void kernel_gemv_noshuffle_q5_0_f32( + __read_only image1d_buffer_t src0_qs, // quantized A + global ushort * src0_qh, // 5th bits + global half2 * src0_d, // A scales + __read_only image1d_buffer_t src1, // B activations + global float * dst, + ulong offsetd, + int ne00, // K + int ne01) // M +{ + uint groupId = get_local_id(1); + uint gid = get_global_id(0); + ushort slid = get_sub_group_local_id(); + + uint K = ne00; + uint M = ne01; + + uint LINE_STRIDE_A = M / 2; + uint BLOCK_STRIDE_A = NSUBGROUPS * M; + + private uint4 regA; + private half2 regS; + private float8 regB; + + private float2 totalSum = (float2)(0.0f); + + for (uint k = groupId; k < (K / QK5_0); k += NSUBGROUPS) { + regS = src0_d[gid + k * LINE_STRIDE_A]; + + ushort4 qh_raw; + qh_raw.s0 = src0_qh[gid + (4*k + 0) * LINE_STRIDE_A]; + qh_raw.s1 = src0_qh[gid + (4*k + 1) * LINE_STRIDE_A]; + qh_raw.s2 = src0_qh[gid + (4*k + 2) * LINE_STRIDE_A]; + qh_raw.s3 = src0_qh[gid + (4*k + 3) * LINE_STRIDE_A]; + + uchar8 raw = as_uchar8(qh_raw); + uchar8 qh_bytes = (uchar8)(raw.s0, raw.s2, raw.s4, raw.s6, + raw.s1, raw.s3, raw.s5, raw.s7); + + // Load activations + if (slid < 4) { + regB.s0123 = read_imagef(src1, (slid * 2 + k * 8)); + regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8)); + } + + regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x; + regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x; + regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x; + regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x; + +#ifdef VECTOR_SUB_GROUP_BROADCAST + dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regB); +#else + dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regB); +#endif // VECTOR_SUB_GROUP_BROADCAST + + regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x; + regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x; + regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x; + regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x; +#ifdef VECTOR_SUB_GROUP_BROADCAST + dequantizeBlockAccum_ns_q5_0_sgbroadcast_8_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regB); +#else + dequantizeBlockAccum_ns_q5_0_sgbroadcast_1_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regB); +#endif // VECTOR_SUB_GROUP_BROADCAST + } + + // reduction in local memory, assumes #wave=4 + local float2 reduceLM[SUBGROUP_SIZE * 3]; + if (groupId == 1) { + reduceLM[SUBGROUP_SIZE * 0 + slid] = totalSum; + } + if (groupId == 2) { + reduceLM[SUBGROUP_SIZE * 1 + slid] = totalSum; + } + if (groupId == 3) { + reduceLM[SUBGROUP_SIZE * 2 + slid] = totalSum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (groupId == 0) { + totalSum += reduceLM[SUBGROUP_SIZE * 0 + slid]; + } + if (groupId == 0) { + totalSum += reduceLM[SUBGROUP_SIZE * 1 + slid]; + } + if (groupId == 0) { + totalSum += reduceLM[SUBGROUP_SIZE * 2 + slid]; + } + + // 2 outputs per fiber in wave 0 + if (groupId == 0) { + dst = (global float*)((global char*)dst + offsetd); + vstore2(totalSum, 0, &(dst[gid * 2])); + } + +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl new file mode 100644 index 000000000000..daf1308ea4b0 --- /dev/null +++ b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q5_1_f32.cl @@ -0,0 +1,294 @@ +#pragma OPENCL EXTENSION cl_khr_fp16 : enable +#pragma OPENCL EXTENSION cl_khr_subgroups : enable + +#ifdef cl_qcom_reqd_sub_group_size +#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable +#define ADRENO_GPU 1 +#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half"))) +#endif + +#define QK5_1 32 +#define NSUBGROUPS 4 +#define SUBGROUP_SIZE 64 + +#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_hi(total_sums, bits4, bits1, scale, minv, y) \ + float shared_y; \ + shared_y = sub_group_broadcast(y.s0, 0); \ + total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s0 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s4 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 0); \ + total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 0); \ + total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 0); \ + total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s4, 0); \ + total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 0); \ + total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 0); \ + total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 0); \ + total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s0, 1); \ + total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s1 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s5 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 1); \ + total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 1); \ + total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 1); \ + total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s4, 1); \ + total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 1); \ + total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 1); \ + total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 1); \ + total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + + +#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_lo(total_sums, bits4, bits1, scale, minv, y) \ + shared_y = sub_group_broadcast(y.s0, 2); \ + total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s2 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s6 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 2); \ + total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 2); \ + total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 2); \ + total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s4, 2); \ + total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 2); \ + total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 2); \ + total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 2); \ + total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s0, 3); \ + total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s3 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s7 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s1, 3); \ + total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s2, 3); \ + total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s3, 3); \ + total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s4, 3); \ + total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s5, 3); \ + total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s6, 3); \ + total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + shared_y = sub_group_broadcast(y.s7, 3); \ + total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y; \ + total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y; \ + + +#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_hi(total_sums, bits4, bits1, scale, minv, y) \ + float8 shared_y; \ + shared_y = sub_group_broadcast(y, 0); \ + total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s0 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \ + total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s0 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \ + total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s0 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \ + total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s0 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \ + total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s0 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \ + total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s0 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \ + total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s0 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \ + total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s0 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \ + total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s4 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \ + total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s4 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \ + total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s4 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \ + total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s4 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \ + total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s4 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \ + total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s4 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \ + total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s4 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \ + total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s4 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \ + shared_y = sub_group_broadcast(y, 1); \ + total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s1 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \ + total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s1 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \ + total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s1 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \ + total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s1 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \ + total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s1 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \ + total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s1 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \ + total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s1 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \ + total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s1 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \ + total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s5 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \ + total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s5 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \ + total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s5 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \ + total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s5 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \ + total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s5 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \ + total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s5 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \ + total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s5 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \ + total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s5 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \ + + +#define dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_lo(total_sums, bits4, bits1, scale, minv, y) \ + shared_y = sub_group_broadcast(y, 2); \ + total_sums.s0 += (((bits4.s0 & 0x000F) | (((bits1.s2 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \ + total_sums.s0 += ((((bits4.s0 & 0x00F0) >> 4) | (((bits1.s2 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \ + total_sums.s0 += ((((bits4.s0 & 0x0F00) >> 8) | (((bits1.s2 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \ + total_sums.s0 += ((((bits4.s0 & 0xF000) >> 12) | (((bits1.s2 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \ + total_sums.s0 += (((bits4.s2 & 0x000F) | (((bits1.s2 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \ + total_sums.s0 += ((((bits4.s2 & 0x00F0) >> 4) | (((bits1.s2 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \ + total_sums.s0 += ((((bits4.s2 & 0x0F00) >> 8) | (((bits1.s2 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \ + total_sums.s0 += ((((bits4.s2 & 0xF000) >> 12) | (((bits1.s2 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \ + total_sums.s1 += (((bits4.s1 & 0x000F) | (((bits1.s6 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \ + total_sums.s1 += ((((bits4.s1 & 0x00F0) >> 4) | (((bits1.s6 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \ + total_sums.s1 += ((((bits4.s1 & 0x0F00) >> 8) | (((bits1.s6 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \ + total_sums.s1 += ((((bits4.s1 & 0xF000) >> 12) | (((bits1.s6 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \ + total_sums.s1 += (((bits4.s3 & 0x000F) | (((bits1.s6 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \ + total_sums.s1 += ((((bits4.s3 & 0x00F0) >> 4) | (((bits1.s6 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \ + total_sums.s1 += ((((bits4.s3 & 0x0F00) >> 8) | (((bits1.s6 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \ + total_sums.s1 += ((((bits4.s3 & 0xF000) >> 12) | (((bits1.s6 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \ + shared_y = sub_group_broadcast(y, 3); \ + total_sums.s0 += (((bits4.s4 & 0x000F) | (((bits1.s3 ) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s0; \ + total_sums.s0 += ((((bits4.s4 & 0x00F0) >> 4) | (((bits1.s3 >> 1) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s1; \ + total_sums.s0 += ((((bits4.s4 & 0x0F00) >> 8) | (((bits1.s3 >> 2) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s2; \ + total_sums.s0 += ((((bits4.s4 & 0xF000) >> 12) | (((bits1.s3 >> 3) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s3; \ + total_sums.s0 += (((bits4.s6 & 0x000F) | (((bits1.s3 >> 4) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s4; \ + total_sums.s0 += ((((bits4.s6 & 0x00F0) >> 4) | (((bits1.s3 >> 5) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s5; \ + total_sums.s0 += ((((bits4.s6 & 0x0F00) >> 8) | (((bits1.s3 >> 6) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s6; \ + total_sums.s0 += ((((bits4.s6 & 0xF000) >> 12) | (((bits1.s3 >> 7) & 0x01) << 4)) * scale.s0 + minv.s0) * shared_y.s7; \ + total_sums.s1 += (((bits4.s5 & 0x000F) | (((bits1.s7 ) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s0; \ + total_sums.s1 += ((((bits4.s5 & 0x00F0) >> 4) | (((bits1.s7 >> 1) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s1; \ + total_sums.s1 += ((((bits4.s5 & 0x0F00) >> 8) | (((bits1.s7 >> 2) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s2; \ + total_sums.s1 += ((((bits4.s5 & 0xF000) >> 12) | (((bits1.s7 >> 3) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s3; \ + total_sums.s1 += (((bits4.s7 & 0x000F) | (((bits1.s7 >> 4) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s4; \ + total_sums.s1 += ((((bits4.s7 & 0x00F0) >> 4) | (((bits1.s7 >> 5) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s5; \ + total_sums.s1 += ((((bits4.s7 & 0x0F00) >> 8) | (((bits1.s7 >> 6) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s6; \ + total_sums.s1 += ((((bits4.s7 & 0xF000) >> 12) | (((bits1.s7 >> 7) & 0x01) << 4)) * scale.s1 + minv.s1) * shared_y.s7; \ + +#ifdef ADRENO_GPU +REQD_SUBGROUP_SIZE_64 +#endif +__kernel void kernel_gemv_noshuffle_q5_1_f32( + __read_only image1d_buffer_t src0_qs, // quantized A + global ushort * src0_qh, // 5th bits + global half2 * src0_d, // A scales + global half2 * src0_m, // A mins + __read_only image1d_buffer_t src1, // B activations + global float * dst, + ulong offsetd, + int ne00, // K + int ne01) // M +{ + uint groupId = get_local_id(1); + uint gid = get_global_id(0); + ushort slid = get_sub_group_local_id(); + + uint K = ne00; + uint M = ne01; + + uint LINE_STRIDE_A = M / 2; + uint BLOCK_STRIDE_A = NSUBGROUPS * M; + + __private uint4 regA; + __private half2 regS; + __private half2 regM; + __private float8 regB; + + __private float2 totalSum = (float2)(0.0f); + + for (uint k = groupId; k < (K / QK5_1); k += NSUBGROUPS) { + regS = src0_d[gid + k * LINE_STRIDE_A]; + regM = src0_m[gid + k * LINE_STRIDE_A]; + + ushort4 qh_raw; + qh_raw.s0 = src0_qh[gid + (4*k + 0) * LINE_STRIDE_A]; + qh_raw.s1 = src0_qh[gid + (4*k + 1) * LINE_STRIDE_A]; + qh_raw.s2 = src0_qh[gid + (4*k + 2) * LINE_STRIDE_A]; + qh_raw.s3 = src0_qh[gid + (4*k + 3) * LINE_STRIDE_A]; + + uchar8 raw = as_uchar8(qh_raw); + uchar8 qh_bytes = (uchar8)(raw.s0, raw.s2, raw.s4, raw.s6, + raw.s1, raw.s3, raw.s5, raw.s7); + + // Load activations + if (slid < 4) { + regB.s0123 = read_imagef(src1, (slid * 2 + k * 8)); + regB.s4567 = read_imagef(src1, (1 + slid * 2 + k * 8)); + } + + regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 0)).x; + regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 1)).x; + regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 2)).x; + regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 3)).x; + +#ifdef VECTOR_SUB_GROUP_BROADCAST + dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB); +#else + dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_hi(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB); +#endif // VECTOR_SUB_GROUP_BROADCAST + + regA.s0 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 4)).x; + regA.s1 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 5)).x; + regA.s2 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x; + regA.s3 = read_imageui(src0_qs, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x; +#ifdef VECTOR_SUB_GROUP_BROADCAST + dequantizeBlockAccum_ns_q5_1_sgbroadcast_8_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB); +#else + dequantizeBlockAccum_ns_q5_1_sgbroadcast_1_lo(totalSum, as_ushort8(regA), qh_bytes, regS, regM, regB); +#endif // VECTOR_SUB_GROUP_BROADCAST + } + + // reduction in local memory, assumes #wave=4 + local float2 reduceLM[SUBGROUP_SIZE * 3]; + if (groupId == 1) { + reduceLM[SUBGROUP_SIZE * 0 + slid] = totalSum; + } + if (groupId == 2) { + reduceLM[SUBGROUP_SIZE * 1 + slid] = totalSum; + } + if (groupId == 3) { + reduceLM[SUBGROUP_SIZE * 2 + slid] = totalSum; + } + + barrier(CLK_LOCAL_MEM_FENCE); + + if (groupId == 0) { + totalSum += reduceLM[SUBGROUP_SIZE * 0 + slid]; + } + if (groupId == 0) { + totalSum += reduceLM[SUBGROUP_SIZE * 1 + slid]; + } + if (groupId == 0) { + totalSum += reduceLM[SUBGROUP_SIZE * 2 + slid]; + } + + // 2 outputs per fiber in wave 0 + if (groupId == 0) { + dst = (global float*)((global char*)dst + offsetd); + vstore2(totalSum, 0, &(dst[gid * 2])); + } + +} diff --git a/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl index 9703b693e567..f5c6fb3e8437 100644 --- a/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl +++ b/ggml/src/ggml-opencl/kernels/gemv_noshuffle_q8_0_f32.cl @@ -174,7 +174,7 @@ __kernel void kernel_gemv_noshuffle_q8_0_f32( regA.s6 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 6)).x; regA.s7 = read_imageui(src0_q, (gid + k * BLOCK_STRIDE_A + LINE_STRIDE_A * 7)).x; - dequantizeBlockAccum_ns_sgbroadcast_1(totalSum, regA, regS, regB); + dequantizeBlockAccum_ns_sgbroadcast_1(totalSum, regA, convert_float(regS), regB); } // reduction in local memory, assumes #wave=4 diff --git a/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl b/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl index cdf8197c4705..a639ec664b13 100644 --- a/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl +++ b/ggml/src/ggml-opencl/kernels/mul_mv_f16_f32_l4.cl @@ -82,3 +82,299 @@ kernel void kernel_mul_mat_f16_f32_l4( } } } + +// Each subgroup produces DR_NDST outputs, assumes ne11 == 1 +#define MUL_MAT_F16_F32_L4_DR_NDST 4 + +#ifdef ADRENO_GPU +REQD_SUBGROUP_SIZE_64 +#endif +kernel void kernel_mul_mat_f16_f32_l4_dr( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + int ne1, + int r2, + int r3 +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global float*)((global char*)dst + offsetd); + + const int r0_base = get_group_id(0) * MUL_MAT_F16_F32_L4_DR_NDST; + const int im = get_group_id(2); + + const int i12 = im % ne12; + const int i13 = im / ne12; + + // assume ne11 == 1 + const ulong offset_src1 = i12*nb12 + i13*nb13; + global float4 * y4 = (global float4 *)(src1 + offset_src1); + + global half4 * x4[MUL_MAT_F16_F32_L4_DR_NDST]; + float sumf[MUL_MAT_F16_F32_L4_DR_NDST]; + + const ulong k_head_off = (i12/r2)*nb02 + (i13/r3)*nb03; + + #pragma unroll + for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) { + int r0 = r0_base + n; + int r0c = r0 < ne01 ? r0 : 0; + ulong off = (ulong)r0c*nb01 + k_head_off; + x4[n] = (global half4 *)(src0 + off); + sumf[n] = 0.0f; + } + + const int n_chunks = ne00 / 4; + const int sg_size = get_max_sub_group_size(); + const int lid = get_sub_group_local_id(); + + for (int i = lid; i < n_chunks; i += sg_size) { + float4 q = y4[i]; + #pragma unroll + for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) { + float4 k = convert_float4(x4[n][i]); + sumf[n] = mad(k.s0, q.s0, sumf[n]); + sumf[n] = mad(k.s1, q.s1, sumf[n]); + sumf[n] = mad(k.s2, q.s2, sumf[n]); + sumf[n] = mad(k.s3, q.s3, sumf[n]); + } + } + + #pragma unroll + for (int n = 0; n < MUL_MAT_F16_F32_L4_DR_NDST; ++n) { + float reduced = sub_group_reduce_add(sumf[n]); + int r0 = r0_base + n; + if (lid == 0 && r0 < ne01) { + dst[im*ne1*ne0 + r0] = reduced; + } + } +} + +// Kernels for decoding, Adreno only for now +#define MUL_MAT_F16_F32_L4_DR_LS_R2_MAX 8 + +#ifdef ADRENO_GPU +#pragma OPENCL EXTENSION cl_qcom_subgroup_shuffle : enable +#define sub_group_shuffle_xor(val, mask) qcom_sub_group_shuffle_xor((val), (mask), CLK_SUB_GROUP_SHUFFLE_WIDTH_WAVE_SIZE_QCOM, 0.0f) + +REQD_SUBGROUP_SIZE_64 +kernel void kernel_mul_mat_f16_f32_l4_dr_ls( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + int ne1, + int r2, + int r3 +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global float*)((global char*)dst + offsetd); + + const int r0_base = get_group_id(0) * 2; + const int kv_grp = get_group_id(2); // KV head group; im = kv_grp*r2 + q + + const int i12_kv = kv_grp % ne02; + const int i13_kv = kv_grp / ne02; + + const int lid = get_sub_group_local_id(); + const int subhalf = lid >> 5; // 0 or 1 (which K row in the WG) + const int intra = lid & 31; // 0..31 (lane within the half) + + const int r0 = r0_base + subhalf; + const int r0c = r0 < ne01 ? r0 : 0; // clamp OOB to row 0; skip write below + + // K row pointer for this lane (one K row per half-wave). + const ulong k_off = (ulong)r0c*nb01 + (ulong)i12_kv*nb02 + (ulong)i13_kv*nb03; + global half4 * x4 = (global half4 *)(src0 + k_off); + + global float4 * y4[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX]; + #pragma unroll + for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) { + const int i12_q = i12_kv*r2 + q; + const ulong q_off = (ulong)i12_q*nb12 + (ulong)i13_kv*nb13; + y4[q] = (global float4 *)(src1 + q_off); + } + + float partial[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX]; + #pragma unroll + for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) { + partial[q] = 0.0f; + } + + const int n_chunks = ne00 / 4; + + for (int i = intra; i < n_chunks; i += 32) { + float4 k = convert_float4(x4[i]); + + #pragma unroll + for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) { + if (q < r2) { + float4 v = y4[q][i]; + partial[q] = mad(k.s0, v.s0, partial[q]); + partial[q] = mad(k.s1, v.s1, partial[q]); + partial[q] = mad(k.s2, v.s2, partial[q]); + partial[q] = mad(k.s3, v.s3, partial[q]); + } + } + } + + // half-wave reduction + #pragma unroll + for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) { + if (q < r2) { + partial[q] += sub_group_shuffle_xor(partial[q], 1u); + partial[q] += sub_group_shuffle_xor(partial[q], 2u); + partial[q] += sub_group_shuffle_xor(partial[q], 4u); + partial[q] += sub_group_shuffle_xor(partial[q], 8u); + partial[q] += sub_group_shuffle_xor(partial[q], 16u); + } + } + + if (intra == 0 && r0 < ne01) { + #pragma unroll + for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) { + if (q < r2) { + const int im = i12_kv*r2 + q + i13_kv*ne12; + dst[im*ne1*ne0 + r0] = partial[q]; + } + } + } +} + +REQD_SUBGROUP_SIZE_64 +kernel void kernel_mul_mat_f16_f32_l4_dr_lq( + global char * src0, + ulong offset0, + global char * src1, + ulong offset1, + global float * dst, + ulong offsetd, + int ne00, + int ne01, + int ne02, + ulong nb00, + ulong nb01, + ulong nb02, + ulong nb03, + int ne10, + int ne11, + int ne12, + ulong nb10, + ulong nb11, + ulong nb12, + ulong nb13, + int ne0, + int ne1, + int r2, + int r3 +) { + src0 = (global char*)((global char*)src0 + offset0); + src1 = (global char*)((global char*)src1 + offset1); + dst = (global float*)((global char*)dst + offsetd); + + const int r0_base = get_group_id(0) * 4; + const int kv_grp = get_group_id(2); + + const int i12_kv = kv_grp % ne02; + const int i13_kv = kv_grp / ne02; + + const int lid = get_sub_group_local_id(); + const int subq = lid >> 4; // 0..3 (which K row) + const int intra = lid & 15; // 0..15 (lane within quarter) + + const int r0 = r0_base + subq; + const int r0c = r0 < ne01 ? r0 : 0; + + const ulong k_off = (ulong)r0c*nb01 + (ulong)i12_kv*nb02 + (ulong)i13_kv*nb03; + global half4 * x4 = (global half4 *)(src0 + k_off); + + global float4 * y4[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX]; + #pragma unroll + for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) { + const int i12_q = i12_kv*r2 + q; + const ulong q_off = (ulong)i12_q*nb12 + (ulong)i13_kv*nb13; + y4[q] = (global float4 *)(src1 + q_off); + } + + float partial[MUL_MAT_F16_F32_L4_DR_LS_R2_MAX]; + #pragma unroll + for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) { + partial[q] = 0.0f; + } + + const int n_chunks = ne00 / 4; + + for (int i = intra; i < n_chunks; i += 16) { + float4 k = convert_float4(x4[i]); + + #pragma unroll + for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) { + if (q < r2) { + float4 v = y4[q][i]; + partial[q] = mad(k.s0, v.s0, partial[q]); + partial[q] = mad(k.s1, v.s1, partial[q]); + partial[q] = mad(k.s2, v.s2, partial[q]); + partial[q] = mad(k.s3, v.s3, partial[q]); + } + } + } + + // quarter-wave reduction + #pragma unroll + for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) { + if (q < r2) { + partial[q] += sub_group_shuffle_xor(partial[q], 1u); + partial[q] += sub_group_shuffle_xor(partial[q], 2u); + partial[q] += sub_group_shuffle_xor(partial[q], 4u); + partial[q] += sub_group_shuffle_xor(partial[q], 8u); + } + } + + if (intra == 0 && r0 < ne01) { + #pragma unroll + for (int q = 0; q < MUL_MAT_F16_F32_L4_DR_LS_R2_MAX; ++q) { + if (q < r2) { + const int im = i12_kv*r2 + q + i13_kv*ne12; + dst[im*ne1*ne0 + r0] = partial[q]; + } + } + } +} +#endif // ADRENO_GPU diff --git a/ggml/src/ggml-opencl/kernels/norm.cl b/ggml/src/ggml-opencl/kernels/norm.cl index 170f822787be..a5ccac241376 100644 --- a/ggml/src/ggml-opencl/kernels/norm.cl +++ b/ggml/src/ggml-opencl/kernels/norm.cl @@ -24,6 +24,7 @@ kernel void kernel_norm( int ne01, int ne02, int ne03, + ulong nb00, ulong nb01, ulong nb02, ulong nb03, @@ -43,7 +44,8 @@ kernel void kernel_norm( // parallel sum sum[get_local_id(0)] = 0.0f; for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) { - sum[get_local_id(0)] += x[i00]; + // this kernel handles float, nb00/4 translates byte offset to element offset + sum[get_local_id(0)] += x[i00*nb00/4]; } // reduce barrier(CLK_LOCAL_MEM_FENCE); @@ -60,7 +62,8 @@ kernel void kernel_norm( global float * y = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; sum[get_local_id(0)] = 0.0f; for (int i00 = get_local_id(0); i00 < ne00; i00 += get_local_size(0)) { - y[i00] = x[i00] - mean; + // this kernel handles float, nb00/4 translates byte offset to element offset + y[i00] = x[i00*nb00/4] - mean; sum[get_local_id(0)] += y[i00] * y[i00]; } diff --git a/ggml/src/ggml-openvino/.clang-format b/ggml/src/ggml-openvino/.clang-format index a2a24d7d33a0..4a5c7c208676 100644 --- a/ggml/src/ggml-openvino/.clang-format +++ b/ggml/src/ggml-openvino/.clang-format @@ -2,12 +2,7 @@ # Override root .clang-format AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false -Cpp11BracedListStyle: true -SpacesInContainerLiterals: false -BreakBeforeBraces: Attach AccessModifierOffset: -4 -IndentCaseBlocks: false -IndentCaseLabels: false Language: Cpp AlignAfterOpenBracket: Align diff --git a/ggml/src/ggml-openvino/CMakeLists.txt b/ggml/src/ggml-openvino/CMakeLists.txt index 175b585661d3..cc089b721fc3 100644 --- a/ggml/src/ggml-openvino/CMakeLists.txt +++ b/ggml/src/ggml-openvino/CMakeLists.txt @@ -1,8 +1,6 @@ -find_package(OpenVINO REQUIRED) +find_package(OpenVINO REQUIRED COMPONENTS Runtime Threading) find_package(OpenCL REQUIRED) -include("${OpenVINO_DIR}/../3rdparty/tbb/lib/cmake/TBB/TBBConfig.cmake") - file(GLOB_RECURSE GGML_HEADERS_OPENVINO "*.h" "*.hpp") file(GLOB_RECURSE GGML_SOURCES_OPENVINO "*.cpp") @@ -11,7 +9,7 @@ ggml_add_backend_library(ggml-openvino ${GGML_HEADERS_OPENVINO} ) -target_link_libraries(ggml-openvino PRIVATE openvino::runtime TBB::tbb OpenCL::OpenCL) +target_link_libraries(ggml-openvino PRIVATE openvino::runtime openvino::threading OpenCL::OpenCL) if (GGML_OPENVINO) if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64") diff --git a/ggml/src/ggml-openvino/ggml-decoder.cpp b/ggml/src/ggml-openvino/ggml-decoder.cpp index 5095e7998493..48c63e4d70fa 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.cpp +++ b/ggml/src/ggml-openvino/ggml-decoder.cpp @@ -1,20 +1,17 @@ #include "ggml-decoder.h" -#include "ggml-backend-impl.h" -#include "ggml-backend.h" +#include "ggml-impl.h" #include "ggml-openvino-extra.h" #include "ggml-openvino.h" #include "ggml-quants.h" - -#include -#include +#include "ggml.h" +#include "utils.h" #include #include #include #include #include -#include #include #include #include @@ -30,12 +27,10 @@ #include #include #include -#include #include #include #include #include -#include #include GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, @@ -44,6 +39,7 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, std::map> & model_weights, bool is_static, bool is_stateful, + bool model_is_splitted, bool is_prefill, int prefill_chunk_size) : m_is_static(is_static), @@ -51,22 +47,23 @@ GgmlOvDecoder::GgmlOvDecoder(ggml_cgraph * cgraph, m_is_prefill(is_prefill), m_naive(false), m_prefill_chunk_size(prefill_chunk_size), + m_model_is_splitted(model_is_splitted), m_cgraph(cgraph), m_model_weights(model_weights), m_model_params(model_params), m_compute_params(compute_params) { - if (auto * env = getenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); env && std::string(env) != "0") { -#ifdef _WIN32 - _putenv_s("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", ""); -#else - unsetenv("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS"); -#endif - print_tensor_address_map(cgraph); + static bool printed_address_map = false; + if (!printed_address_map) { + if (ggml_openvino_getenv_int("GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS")) { + printed_address_map = true; + print_tensor_address_map(cgraph); + } } validate_cgraph(); set_input_output(); + compute_node_dynamic_dims(); compute_model_inputs(); compute_model_outputs(); @@ -136,6 +133,29 @@ void GgmlOvDecoder::set_input_output() { } current_node_info.node_inputs[src_name] = src; current_node_info.node_inputs_names.push_back(src_name); + + if (src->op == GGML_OP_VIEW) { + // Traverse upward through nested VIEW operations + std::remove_reference_t view_chain; + auto current = src; + + while (current != nullptr) { + auto current_name = std::string(current->name); + if (current->flags & GGML_TENSOR_FLAG_INPUT) { + current_name = get_graph_input_ov_name(current, node); + } + view_chain.emplace_back(current_name, current); + // If current src is also a VIEW, continue traversing + if (current->src[0] != nullptr && current->src[0]->op == GGML_OP_VIEW) { + current = current->src[0]; + } else { + break; + } + } + + // Assign all collected view inputs to node_inputs_views + current_node_info.node_inputs_views[src_name] = view_chain; + } } m_node_info_list.push_back(current_node_info); @@ -156,20 +176,13 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { if (src->ne[2] * src->ne[3] == node->ne[1]) { op_case = 5; } - } else if (src->ne[0] * src->ne[1] == node->ne[1]) { + } else if (src->ne[0] * src->ne[1] * src->ne[2] == node->ne[1]) { op_case = 3; } else if (src->ne[1] * src->ne[2] == node->ne[1]) { op_case = 6; } - break; - } - case GGML_OP_CONT: { - if (node->src[0]->op == GGML_OP_PERMUTE) { - op_case = 1; - } else if (node->src[0]->op == GGML_OP_TRANSPOSE) { - op_case = 2; - } else if (node->src[0]->op == GGML_OP_VIEW) { - op_case = 3; + if (op_case == 0 && ggml_nelements(node) == ggml_nelements(src)) { + op_case = 6; } break; } @@ -179,23 +192,41 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { } else if (node->src[0]->src[0]->op == GGML_OP_NONE) { // kv cache tensor std::string src_name(node->view_src->name); - int layer = extract_layer_from_name(src_name); - if (!is_swa_layer(layer)) { - op_case = 2; + int layer = extract_layer_from_name(src_name).value(); + if (ggml_is_contiguous(node->src[0])) { + // - 19: [ 64, 8, 256, 1] VIEW cache_k_l0 (view) [ 2, 128, 1024, 1048576] + // [ 512, 1024, 1, 1] 0: NONE cache_k_l0 [ 2, 1024, 1048576, 1048576] + // - 20: [ 64, 256, 8, 1] PERMUTE cache_k_l0 (view) (permuted) [ 2, 1024, 128, 1048576] + // [ 64, 8, 256, 1] 0: VIEW cache_k_l0 (view) [ 2, 128, 1024, 1048576] + if (!is_swa_layer(layer)) { + op_case = 3; + } else { + op_case = 4; + } } else { - op_case = 3; + // special case of cache v when `-fa off` + // - 17: [ 256, 8, 64, 1] VIEW cache_v_l0 (view) [ 2, 131072, 2048, 1048576] + // [ 512, 1024, 1, 1] 0: NONE cache_v_l0 [ 2, 1024, 1048576, 1048576] + // - 18: [ 256, 64, 8, 1] PERMUTE cache_v_l0 (view) (permuted) [ 2, 2048, 131072, 1048576] + // [ 256, 8, 64, 1] 0: VIEW cache_v_l0 (view) [ 2, 131072, 2048, 1048576] + if (!is_swa_layer(layer)) { + op_case = 5; + } else { + op_case = 6; + } } } else { // rope'ed query tensor - op_case = 4; + op_case = 2; } break; } case GGML_OP_MUL_MAT: { - if (node->src[0]->op == GGML_OP_CONT && node->src[0]->src[0]->op == GGML_OP_TRANSPOSE) { - op_case = 2; - } else if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) { + if (node->src[0]->op == GGML_OP_VIEW && node->src[1]->op == GGML_OP_VIEW) { op_case = 3; + } else if (node->src[1]->op == GGML_OP_SOFT_MAX) { + // In the case of `-fa off`, softmax is used, v_trans=true, the dynamic dim is ne[0] for cache_v + op_case = 2; } break; } @@ -208,43 +239,57 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { case GGML_OP_ROPE: { const int mode = node->op_params[2]; switch (mode) { - case GGML_ROPE_TYPE_NEOX: { - op_case = 0x00010000; + case GGML_ROPE_TYPE_NEOX: { + op_case = 1; break; } - case GGML_ROPE_TYPE_IMROPE: { - op_case = 0x00020000; + case GGML_ROPE_TYPE_IMROPE: { + op_case = 2; break; } default: - op_case = 0x00000000; + op_case = 0; break; } - if (node->src[0]->op == GGML_OP_VIEW) { - op_case = (op_case | 0x00000002); - } break; } case GGML_OP_VIEW: { if (node->src[0]->op == GGML_OP_VIEW) { auto * src = node->src[0]; if (ggml_nelements(node) != ggml_nelements(src)) { - throw std::runtime_error("Unsupported VIEW case"); + // throw std::runtime_error("Unsupported VIEW case"); + } + op_case = 0; + if (m_model_is_splitted && m_model_inputs.find(std::string(src->name)) != m_model_inputs.end()) { + op_case = 0; } - op_case = 2; } { auto * src = node->src[0]; - if ((ggml_nelements(node) != ggml_nelements(src)) && m_naive) { - // Compare each dimension of node and src, if only one dimension differs then op_case=3 + if (ggml_nelements(node) != ggml_nelements(src)) { + // Case 4: select one slice on src dim1 (via view offset), keep src dim2 as output dim1. + // Typical pattern: + // src: ne=[N, M, K, 1], nb=[b0, b1, b2, b3] + // dst: ne=[N, K, 1, 1], nb=[b0, b2, b3, b3] + if (node->ne[0] == src->ne[0] && node->ne[1] == src->ne[2] && node->ne[2] == 1 && + node->nb[0] == src->nb[0] && node->nb[1] == src->nb[2] && src->ne[1] > 1) { + op_case = 0; + break; + } + + // General case 3: shape differs from source (one or more dims) and is handled as VIEW slicing. int diff_count = 0; for (int i = 0; i < GGML_MAX_DIMS; i++) { if (node->ne[i] != src->ne[i]) { diff_count++; } + // if node ne[i] > src ne[i], case = 0 + if (node->ne[i] > src->ne[i]) { + return 0; + } } - if (diff_count == 1) { - op_case = 3; + if (diff_count >= 1) { + op_case = 0; } } } @@ -256,9 +301,11 @@ int GgmlOvDecoder::compute_op_case(const ggml_tensor * node) const { return op_case; } -int extract_layer_from_name(const std::string & name) { +std::optional extract_layer_from_name(const std::string & name) { size_t pos1 = name.find("_l"); - assert(pos1 != std::string::npos); + if (pos1 == std::string::npos) { + return std::nullopt; + } pos1 += 2; size_t pos2 = name.find(' ', pos1); if (pos2 == std::string::npos) { @@ -272,26 +319,101 @@ int extract_layer_from_name(const std::string & name) { std::pair GgmlOvDecoder::compute_llm_params(ggml_cgraph * cgraph, bool is_static) { ModelParams model_params; ComputeParams compute_params; + auto get_attention_pattern_case = [](const ggml_tensor * node) -> int { + if (node == nullptr) { + return -1; + } + + switch (node->op) { + case GGML_OP_FLASH_ATTN_EXT: + if (node->src[0] == nullptr || node->src[1] == nullptr || node->src[3] == nullptr) { + return -1; + } + switch (node->src[1]->op) { + case GGML_OP_PERMUTE: + // case 0: node op is FLASH_ATTN_EXT, src 1 not null & op is PERMUTE & the permuted tensor src is the view of cache k + if (node->src[1]->src[0] != nullptr && node->src[1]->src[0]->op == GGML_OP_VIEW) { + return 0; + } + break; + case GGML_OP_CPY: + // case 1: node op is FLASH_ATTN_EXT, src 1 not null & op is CPY & the copied tensor src is PERMUTE & the permuted tensor src is the view of cache k + if (node->src[1]->src[0] != nullptr && node->src[1]->src[0]->op == GGML_OP_PERMUTE && + node->src[1]->src[0]->src[0] != nullptr && node->src[1]->src[0]->src[0]->op == GGML_OP_VIEW) { + return 1; + } + break; + default: + break; + } + break; + case GGML_OP_SOFT_MAX: + // case 2: node op is SOFT_MAX, src 0 not null & op is MUL_MAT & the src 0 of MUL_MAT is PERMUTE & the permuted tensor src is the view of cache k + if (node->src[0] != nullptr && node->src[1] != nullptr && node->src[0]->op == GGML_OP_MUL_MAT && + node->src[0]->src[0] != nullptr && node->src[0]->src[1] != nullptr && + node->src[0]->src[0]->op == GGML_OP_PERMUTE && node->src[0]->src[0]->src[0] != nullptr && + node->src[0]->src[0]->src[0]->op == GGML_OP_VIEW) { + return 2; + } + // case 3: node op is SOFT_MAX, src 0 not null & op is ADD & the src 0 of ADD is MUL_MAT & the src 0 of MUL_MAT is PERMUTE + if (node->src[0]->op == GGML_OP_ADD && node->src[0]->src[0] != nullptr && + node->src[0]->src[0]->op == GGML_OP_MUL_MAT && node->src[0]->src[0]->src[0] != nullptr && + node->src[0]->src[0]->src[0]->op == GGML_OP_PERMUTE) { + return 3; + } + break; + default: + break; + } + + return -1; + }; + + bool rope_seen = false; for (int i = 0; i < cgraph->n_nodes; i++) { auto * node = cgraph->nodes[i]; std::string name = std::string(node->name); - if (node->op == GGML_OP_FLASH_ATTN_EXT) { - model_params.n_heads = node->src[0]->ne[2]; - model_params.n_heads_kv = node->src[1]->ne[2]; - model_params.head_size = node->src[0]->ne[0]; + const int attention_pattern_case = get_attention_pattern_case(node); + if (attention_pattern_case != -1) { + ggml_tensor * cache_k_permute = nullptr; + ggml_tensor * mask = nullptr; + + switch (attention_pattern_case) { + case 0: + cache_k_permute = node->src[1]; + mask = node->src[3]; + break; + case 1: + cache_k_permute = node->src[1]->src[0]; + mask = node->src[3]; + break; + case 2: + cache_k_permute = node->src[0]->src[0]; + mask = node->src[1]; + break; + case 3: + cache_k_permute = node->src[0]->src[0]->src[0]; + mask = node->src[1]; + break; + default: + break; + } + + assert(cache_k_permute != nullptr); + + model_params.head_size = cache_k_permute->ne[0]; + model_params.n_heads_kv = cache_k_permute->ne[2]; compute_params.input_len = node->src[0]->ne[1]; + compute_params.token_len_per_seq = node->src[0]->ne[1]; - auto * cache_k_perm = node->src[1]; - if (cache_k_perm->op == GGML_OP_CPY) { - cache_k_perm = cache_k_perm->src[0]; + auto * cache_k_view = cache_k_permute->src[0]; + if (cache_k_view->op != GGML_OP_VIEW || mask == nullptr) { + continue; } - assert(cache_k_perm->op == GGML_OP_PERMUTE); - auto * cache_k_view = cache_k_perm->src[0]; - assert(cache_k_view->op == GGML_OP_VIEW); - auto * cache_k = cache_k_view->src[0]; - int layer = extract_layer_from_name(cache_k->name); - auto * mask = node->src[3]; + ggml_tensor * cache_k = cache_k_view->src[0]; + int layer = extract_layer_from_name(cache_k->name).value(); + std::string mask_name(mask->name); model_params.kv_buffer_ctx_id = ggml_backend_openvino_buffer_get_ctx_id(cache_k->buffer); @@ -308,7 +430,6 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr size_t offset; memcpy(&offset, cache_k_view->op_params, sizeof(size_t)); compute_params.seq_active_start = offset / seq_size; - compute_params.token_len_per_seq = node->ne[2]; if (mask_name.find("swa") != std::string::npos) { compute_params.attention_size_swa = mask->ne[0]; @@ -320,10 +441,40 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr compute_params.attention_size_swa = model_params.ctx_per_seq_swa; compute_params.token_len_per_seq = 1; } - break; + } + + if (node->op == GGML_OP_MUL_MAT && node->src[0]->op == GGML_OP_PERMUTE && + node->src[0]->src[0]->op == GGML_OP_VIEW && is_kvcache(node->src[0]->view_src, node->view_src)) { + if (node->src[1]->op == GGML_OP_PERMUTE && node->src[1]->src[0]->op == GGML_OP_VIEW && + node->src[1]->src[0]->src[0]->op == GGML_OP_ROPE) { + compute_params.attention_size = node->ne[0]; + } + } + + // if the node op is TRANSPOSE and its input is PERMUTE and the source of the PERMUTE is VIEW, then get the attention size with the TRANSPOSE node ne[0] (in case no GGML_OP_FLASH_ATTN_EXT) + if (node->op == GGML_OP_TRANSPOSE && node->src[0]->op == GGML_OP_PERMUTE && + node->src[0]->src[0]->op == GGML_OP_VIEW) { + compute_params.attention_size = node->ne[0]; + if (is_static) { + compute_params.attention_size = model_params.ctx_per_seq; + } } if (node->op == GGML_OP_ROPE) { - memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15); + if (compute_params.token_len_per_seq == -1 && node->src[1] != nullptr) { + compute_params.token_len_per_seq = ggml_nelements(node->src[1]); + } + + // When multiple ROPE ops in the graph disagree on op_params (e.g. gemma4's + // mixed SWA/non-SWA layers with different n_dims or freq_base), we cannot + // share a single precomputed rope_sin/rope_cos. Track divergence so the + // translator falls back to per-op make_sin_cos in that case. + static_assert(sizeof(model_params.rope_params) == sizeof(int32_t) * 15, "rope_params size"); + if (!rope_seen) { + memcpy(model_params.rope_params, node->op_params, sizeof(int32_t) * 15); + rope_seen = true; + } else if (memcmp(model_params.rope_params, node->op_params, sizeof(int32_t) * 15) != 0) { + model_params.mixed_rope_params = true; + } } } auto * output_tensor = cgraph->nodes[cgraph->n_nodes - 1]; @@ -333,7 +484,6 @@ std::pair GgmlOvDecoder::compute_llm_params(ggml_cgr compute_params.output_len = 1; } model_params.ctx = model_params.ctx_per_seq * model_params.n_seq; - model_params.ctx_swa = model_params.ctx_per_seq_swa * model_params.n_seq; return {model_params, compute_params}; } @@ -343,9 +493,11 @@ void GgmlOvDecoder::validate_cgraph() const { } } -ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const { +ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, + const ggml_tensor * input, + int dynamic_dim_index) const { if (m_naive) { - return input!= nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)}; + return input != nullptr ? ov::PartialShape{get_shape(input)} : ov::PartialShape{get_shape(op)}; } auto name = std::string(input->name); ov::PartialShape input_shape; @@ -394,6 +546,15 @@ ov::PartialShape GgmlOvDecoder::get_graph_input_shape(const ggml_tensor * op, co } else { input_shape = ov::PartialShape{get_shape(input)}; } + if (dynamic_dim_index != -1 && m_model_is_splitted) { + input_shape[3 - dynamic_dim_index] = -1; + } + if (op->op == GGML_OP_SOFT_MAX && op->src[1] != nullptr && op->src[1]->op == GGML_OP_NONE && + op->src[1]->flags & GGML_TENSOR_FLAG_INPUT && op->src[1] == input) { + // for softmax input mask, the shape is [1, 1, seq_active, seq_active], where seq_active is determined by the input active sequence length instead of the kv cache sequence length + input_shape[2] = -1; + input_shape[3] = -1; + } return input_shape; } @@ -421,15 +582,19 @@ void GgmlOvDecoder::add_extra_inputs() { } }; - create_1d_input("attention_size", m_compute_params.attention_size); + if (m_compute_params.attention_size != -1) { + create_1d_input("attention_size", m_compute_params.attention_size); + } if (m_compute_params.attention_size_swa != -1) { create_1d_input("attention_size_swa", m_compute_params.attention_size_swa); } create_1d_input("n_seq_active", m_compute_params.n_seq_active); create_1d_input("seq_active_start", m_compute_params.seq_active_start); create_1d_input("seq_active_end", m_compute_params.seq_active_start + m_compute_params.n_seq_active); - create_1d_input("token_len_per_seq", m_compute_params.token_len_per_seq); - // create_1d_input("token_len", m_token_len_per_seq * m_n_seq_active); + if (m_compute_params.token_len_per_seq != -1) { + create_1d_input("token_len_per_seq", m_compute_params.token_len_per_seq); + } + // create_1d_input("token_len", m_compute_params.token_len_per_seq * m_compute_params.n_seq_active); } bool GgmlOvDecoder::node_is_used_as_src(const int node_idx) { @@ -455,8 +620,8 @@ void GgmlOvDecoder::compute_model_inputs() { std::string node_name(node->name); if (m_model_weights.find(node_name) == m_model_weights.end()) { m_inputs[node_name] = node; - auto param_node = - std::make_shared(get_ov_type(node), get_graph_input_shape(node, nullptr)); + auto param_node = std::make_shared( + get_ov_type(node), get_graph_input_shape(node, nullptr, m_node_dynamic_dims[node])); param_node->set_friendly_name(node_name); param_node->output(0).get_tensor().set_names({node_name}); m_model_inputs[node_name] = param_node; @@ -500,7 +665,13 @@ void GgmlOvDecoder::compute_model_inputs() { m_model_params.kv_names.push_back(src_name); } } - ov::PartialShape param_shape = get_graph_input_shape(node, src); + // Resolve nested VIEW nodes by following src[0] until the first non-VIEW tensor. + while (src->op == GGML_OP_VIEW && src->src[0] != nullptr) { + src = src->src[0]; + src_name = std::string(src->name); + } + m_inputs[src_name] = src; + ov::PartialShape param_shape = get_graph_input_shape(node, src, m_node_dynamic_dims[src]); auto param_node = std::make_shared(get_ov_type(src), param_shape); param_node->set_friendly_name(src_name); param_node->output(0).get_tensor().set_names({src_name}); @@ -515,7 +686,7 @@ void GgmlOvDecoder::compute_model_outputs() { for (int node_n = 0; node_n < m_cgraph->n_nodes; node_n++) { auto * cur_node = m_cgraph->nodes[node_n]; // if the node op is NONE means this node is not used at all, we can skip it directly without adding to model outputs. - if (cur_node->op == GGML_OP_NONE) { + if (cur_node->op == GGML_OP_NONE || cur_node->op == GGML_OP_VIEW || cur_node->op == GGML_OP_RESHAPE) { continue; } auto cur_node_use_count = m_cgraph->use_counts[ggml_hash_find(&m_cgraph->visited_hash_set, cur_node)]; @@ -644,15 +815,26 @@ std::shared_ptr GgmlOvDecoder::create_weight_node(ggml_tensor * tensor } } + // MUL_MAT_ID expert weights are 3D GGML tensors [k, m, n_expert]. + // Keep the full reversed 4D shape when materializing non-quantized constants, + // otherwise the expert dimension is collapsed and later Gather/MatMul logic + // only sees a single expert slice. + if (!ggml_is_quantized(tensor->type) && (tensor->ne[2] > 1 || tensor->ne[3] > 1)) { + auto weight_tensor = ov::Tensor(get_ov_type(tensor), get_shape(tensor), tensor->data); + auto weight_node = std::make_shared(weight_tensor); + weight_node->set_friendly_name(tensor->name); + return weight_node; + } + // There are three cases where we need to create a new weight node: // 1. weights are in openvino_host_buffer. Weight loading to host buffer will not trigger backend_buffer_set_tensor // 2. weights are in cpu/cpu_mapped buffer. On token_embd.weight goes to case 1 or 2, depending on whether mmap or direct_io is used // 3. test-backend-ops. buffers in test-backend-ops does not set USAGE_WEIGHT so backend_buffer_set_tensor will not create weight node // GGML_LOG_DEBUG("%s: creating new weight node for %s\n", __func__, tensor->name); - static const std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, - GGML_TYPE_Q8_0, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, - GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K}; + static const std::set weight_types = {GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q8_0, + GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q5_1, GGML_TYPE_Q4_K, + GGML_TYPE_Q5_K, GGML_TYPE_Q6_K}; if (weight_types.find(tensor->type) == weight_types.end()) { throw std::runtime_error("Unexpected weight tensor type: " + std::string(tensor->name) + " with type " + ggml_type_name(tensor->type)); @@ -860,6 +1042,161 @@ std::vector GgmlOvDecoder::get_input_stride(int node_idx, const std::str return get_stride(m_node_info_list[node_idx].node_inputs.at(name)); } +size_t GgmlOvDecoder::get_view_input_size(int node_idx, const std::string & name) const { + auto it = m_node_info_list[node_idx].node_inputs_views.find(name); + if (it != m_node_info_list[node_idx].node_inputs_views.end()) { + return it->second.size(); + } + return 0; +} + +size_t GgmlOvDecoder::get_view_input_offset(int node_idx, const std::string & name, size_t view_index) const { + auto it = m_node_info_list[node_idx].node_inputs_views.find(name); + if (it != m_node_info_list[node_idx].node_inputs_views.end()) { + if (view_index < it->second.size()) { + return it->second[view_index].second->view_offs; + } + } + return 0; +} + +size_t GgmlOvDecoder::get_view_input_src_offset(int node_idx, const std::string & name, size_t view_index) const { + auto it = m_node_info_list[node_idx].node_inputs_views.find(name); + if (it != m_node_info_list[node_idx].node_inputs_views.end()) { + if (view_index < it->second.size()) { + auto * view_tensor = it->second[view_index].second; + if (view_tensor && view_tensor->src[0]) { + return view_tensor->src[0]->view_offs; + } + } + } + return 0; +} + +std::vector GgmlOvDecoder::get_view_input_stride(int node_idx, + const std::string & name, + size_t view_index) const { + auto it = m_node_info_list[node_idx].node_inputs_views.find(name); + if (it != m_node_info_list[node_idx].node_inputs_views.end()) { + if (view_index < it->second.size()) { + return get_stride(it->second[view_index].second); + } + } + return {}; +} + +std::vector GgmlOvDecoder::get_view_input_src_stride(int node_idx, + const std::string & name, + size_t view_index) const { + auto it = m_node_info_list[node_idx].node_inputs_views.find(name); + if (it != m_node_info_list[node_idx].node_inputs_views.end()) { + if (view_index < it->second.size()) { + auto * view_tensor = it->second[view_index].second; + if (view_tensor && view_tensor->src[0]) { + return get_stride(view_tensor->src[0]); + } + } + } + return {}; +} + +ov::Shape GgmlOvDecoder::get_view_input_ggml_shape(int node_idx, const std::string & name, size_t view_index) const { + auto it = m_node_info_list[node_idx].node_inputs_views.find(name); + if (it != m_node_info_list[node_idx].node_inputs_views.end()) { + if (view_index < it->second.size()) { + return get_shape(it->second[view_index].second); + } + } + return {}; +} + +ov::Shape GgmlOvDecoder::get_view_input_src_ggml_shape(int node_idx, + const std::string & name, + size_t view_index) const { + auto it = m_node_info_list[node_idx].node_inputs_views.find(name); + if (it != m_node_info_list[node_idx].node_inputs_views.end()) { + if (view_index < it->second.size()) { + auto * view_tensor = it->second[view_index].second; + if (view_tensor && view_tensor->src[0]) { + return get_shape(view_tensor->src[0]); + } + } + } + return {}; +} + +ov::PartialShape GgmlOvDecoder::get_view_input_ov_shape(int node_idx, + const std::string & name, + size_t view_index) const { + auto it = m_node_info_list[node_idx].node_inputs_views.find(name); + if (it != m_node_info_list[node_idx].node_inputs_views.end()) { + if (view_index < it->second.size()) { + auto * tensor = it->second[view_index].second; + ov::PartialShape shape = ov::PartialShape{get_shape(tensor)}; + + // Check if this tensor has a dynamic dimension + auto dynamic_it = m_node_dynamic_dims.find(tensor); + if (dynamic_it != m_node_dynamic_dims.end() && dynamic_it->second != -1) { + int dynamic_dim_index = dynamic_it->second; + // GGML uses reverse indexing, so convert to OpenVINO indexing + shape[3 - dynamic_dim_index] = m_is_static ? get_static_n_tokens() : -1; + } + + return shape; + } + } + return {}; +} + +ov::PartialShape GgmlOvDecoder::get_view_input_src_ov_shape(int node_idx, + const std::string & name, + size_t view_index) const { + auto it = m_node_info_list[node_idx].node_inputs_views.find(name); + if (it != m_node_info_list[node_idx].node_inputs_views.end()) { + if (view_index < it->second.size()) { + auto * view_tensor = it->second[view_index].second; + if (view_tensor && view_tensor->src[0]) { + auto * src_tensor = view_tensor->src[0]; + ov::PartialShape shape = ov::PartialShape{get_shape(src_tensor)}; + + // Check if this tensor has a dynamic dimension + auto dynamic_it = m_node_dynamic_dims.find(src_tensor); + if (dynamic_it != m_node_dynamic_dims.end() && dynamic_it->second != -1) { + int dynamic_dim_index = dynamic_it->second; + // GGML uses reverse indexing, so convert to OpenVINO indexing + shape[3 - dynamic_dim_index] = m_is_static ? get_static_n_tokens() : -1; + } + + return shape; + } + } + } + return {}; +} + +std::string GgmlOvDecoder::get_view_input_name(int node_idx, const std::string & name, size_t view_index) const { + auto it = m_node_info_list[node_idx].node_inputs_views.find(name); + if (it != m_node_info_list[node_idx].node_inputs_views.end()) { + if (view_index < it->second.size()) { + return it->second[view_index].second->name; + } + } + return ""; +} + +std::string GgmlOvDecoder::get_view_input_src_name(int node_idx, const std::string & name, size_t view_index) const { + auto it = m_node_info_list[node_idx].node_inputs_views.find(name); + if (it != m_node_info_list[node_idx].node_inputs_views.end()) { + if (view_index < it->second.size()) { + auto * view_tensor = it->second[view_index].second; + if (view_tensor && view_tensor->src[0]) { + return view_tensor->src[0]->name; + } + } + } + return ""; +} + ov::element::Type GgmlOvDecoder::get_input_type(int node_idx, const std::string & name) const { return get_ov_type(m_node_info_list[node_idx].node_inputs.at(name)); } @@ -885,6 +1222,11 @@ ov::element::Type GgmlOvDecoder::get_output_type(const int node_idx) const { return get_ov_type(m_node_info_list[node_idx].node); } +std::vector GgmlOvDecoder::get_output_stride(int node_idx) const { + auto * ggml_tensor = m_node_info_list[node_idx].node; + return get_stride(ggml_tensor); +} + std::vector GgmlOvDecoder::get_output_names(int node_idx) const { return {m_node_info_list[node_idx].node_output_name}; } @@ -894,6 +1236,14 @@ const std::string & GgmlOvDecoder::get_op_name() const { return unknown_name; } +int32_t GgmlOvDecoder::get_op_dynamic_dim(int node_idx) const { + auto it = m_node_dynamic_dims.find(m_node_info_list[node_idx].node); + if (it == m_node_dynamic_dims.end()) { + return -1; + } + return it->second; +} + const std::string & GgmlOvDecoder::get_op_name(int node_idx) const { return m_node_info_list[node_idx].node_name; } @@ -906,6 +1256,10 @@ int32_t * GgmlOvDecoder::get_output_op_params(int node_idx) const { return m_node_info_list[node_idx].node->op_params; } +size_t GgmlOvDecoder::get_output_op_offset(int node_idx) const { + return m_node_info_list[node_idx].node->view_offs; +} + void GgmlOvDecoder::visit_subgraph(std::function, int node_idx)> node_visitor) const { for (int node_idx = 0; node_idx < m_cgraph->n_nodes; node_idx++) { if (m_cgraph->nodes[node_idx]->op == GGML_OP_NONE) { @@ -916,63 +1270,14 @@ void GgmlOvDecoder::visit_subgraph(std::function ops = { - {GGML_OP_NONE, "GGML_OP_NONE" }, - {GGML_OP_ACC, "GGML_OP_ACC" }, - {GGML_OP_ADD, "GGML_OP_ADD" }, - {GGML_OP_ADD1, "GGML_OP_ADD1" }, - {GGML_OP_CONT, "GGML_OP_CONT" }, - {GGML_OP_DIV, "GGML_OP_DIV" }, - {GGML_OP_DUP, "GGML_OP_DUP" }, - {GGML_OP_GET_ROWS, "GGML_OP_GET_ROWS" }, - {GGML_OP_MUL, "GGML_OP_MUL" }, - {GGML_OP_MUL_MAT, "GGML_OP_MUL_MAT" }, - {GGML_OP_PERMUTE, "GGML_OP_PERMUTE" }, - {GGML_OP_RESHAPE, "GGML_OP_RESHAPE" }, - {GGML_OP_RMS_NORM, "GGML_OP_RMS_NORM" }, - {GGML_OP_ROPE, "GGML_OP_ROPE" }, - {GGML_OP_SCALE, "GGML_OP_SCALE" }, - {GGML_OP_SOFT_MAX, "GGML_OP_SOFT_MAX" }, - {GGML_OP_SUB, "GGML_OP_SUB" }, - {GGML_OP_TRANSPOSE, "GGML_OP_TRANSPOSE" }, - {GGML_OP_VIEW, "GGML_OP_VIEW" }, - {GGML_OP_SET_ROWS, "GGML_OP_SET_ROWS" }, - {GGML_OP_CPY, "GGML_OP_CPY" }, - {GGML_OP_FLASH_ATTN_EXT, "GGML_OP_FLASH_ATTN_EXT"}, - }; - static const std::map unary_ops = { - {GGML_UNARY_OP_ABS, "GGML_UNARY_OP_ABS" }, - {GGML_UNARY_OP_SGN, "GGML_UNARY_OP_SGN" }, - {GGML_UNARY_OP_NEG, "GGML_UNARY_OP_NEG" }, - {GGML_UNARY_OP_STEP, "GGML_UNARY_OP_STEP" }, - {GGML_UNARY_OP_TANH, "GGML_UNARY_OP_TANH" }, - {GGML_UNARY_OP_ELU, "GGML_UNARY_OP_ELU" }, - {GGML_UNARY_OP_RELU, "GGML_UNARY_OP_RELU" }, - {GGML_UNARY_OP_SIGMOID, "GGML_UNARY_OP_SIGMOID" }, - {GGML_UNARY_OP_GELU, "GGML_UNARY_OP_GELU" }, - {GGML_UNARY_OP_GELU_QUICK, "GGML_UNARY_OP_GELU_QUICK" }, - {GGML_UNARY_OP_SILU, "GGML_UNARY_OP_SILU" }, - {GGML_UNARY_OP_HARDSWISH, "GGML_UNARY_OP_HARDSWISH" }, - {GGML_UNARY_OP_HARDSIGMOID, "GGML_UNARY_OP_HARDSIGMOID"}, - {GGML_UNARY_OP_EXP, "GGML_UNARY_OP_EXP" }, - {GGML_UNARY_OP_COUNT, "GGML_UNARY_OP_COUNT" } - }; - static const std::map glu_ops = { - {GGML_GLU_OP_SWIGLU, "GGML_GLU_OP_SWIGLU"}, - {GGML_GLU_OP_GEGLU, "GGML_GLU_OP_GEGLU" }, - {GGML_GLU_OP_REGLU, "GGML_GLU_OP_REGLU" } - }; - switch (node->op) { case GGML_OP_UNARY: - return unary_ops.at(ggml_get_unary_op(node)); + return std::string("GGML_UNARY_OP_") + ggml_unary_op_name(ggml_get_unary_op(node)); case GGML_OP_GLU: - return glu_ops.at(ggml_get_glu_op(node)); + return std::string("GGML_GLU_OP_") + ggml_glu_op_name(ggml_get_glu_op(node)); default: - return ops.at(node->op); + return std::string("GGML_OP_") + ggml_op_name(node->op); } - static const std::string unknown_op = "UNKNOWN_GGML_OP"; - return unknown_op; } const std::string & GgmlOvDecoder::get_op_type(int node_idx) const { @@ -983,3 +1288,301 @@ const std::string & GgmlOvDecoder::get_op_type() const { static const std::string unknown_op = "UNKNOWN_GGML_OP"; return unknown_op; } + +void GgmlOvDecoder::compute_node_dynamic_dims() { + auto visit_node = [&](auto && self, ggml_tensor * node) -> void { + if (!node) { + return; + } + + if (node->op == GGML_OP_CPY) { + m_node_dynamic_dims[node] = -1; + } + + if (m_node_dynamic_dims.count(node)) { + return; + } + for (int i = 0; i < GGML_MAX_SRC; i++) { + ggml_tensor * src = node->src[i]; + if (src == nullptr) { + continue; + } + struct ggml_tensor * root_src = nullptr; + // if (src->org_src) { + // root_src = src->org_src; + // } + if (root_src) { + if (is_inp_tok(root_src, node) || is_inp_pos(root_src, node) || is_output_idx(root_src, node)) { + m_node_dynamic_dims[root_src] = 0; + m_node_dynamic_dims[src] = m_node_dynamic_dims[root_src]; + continue; + } + self(self, root_src); + m_node_dynamic_dims[src] = m_node_dynamic_dims[root_src]; + } else { + if (is_inp_tok(src, node) || is_inp_pos(src, node) || is_output_idx(src, node)) { + m_node_dynamic_dims[src] = 0; + continue; + } + if (node->op == GGML_OP_VIEW && src->op == GGML_OP_NONE && !is_stateful() && !m_model_is_splitted) { + m_node_dynamic_dims[src] = 1; + continue; + } + self(self, src); + } + } + switch (node->op) { + case GGML_OP_NONE: + m_node_dynamic_dims[node] = -1; + break; + case GGML_OP_GET_ROWS: + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[1]] != -1) { + auto dynamic_dim_idx = m_node_dynamic_dims[node->src[1]]; + if (dynamic_dim_idx == 0) { + m_node_dynamic_dims[node] = 1; + } else { + auto dynamic_dim_stride = node->src[1]->nb[dynamic_dim_idx] / ggml_type_size(node->src[1]->type) * + ggml_type_size(node->src[0]->type); + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (dynamic_dim_stride == node->src[0]->nb[i]) { + m_node_dynamic_dims[node] = i; + break; + } + } + } + // OPENVINO_ASSERT(dynamic_dim_value == node->ne[m_node_dynamic_dims[node]], + // "Dynamic dim value mismatch for node: " + std::string(node->name) + + // " and its src[1]: " + std::string(node->src[1]->name)); + } + break; + case GGML_OP_MUL: + case GGML_OP_MUL_MAT: + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; + } + if (m_node_dynamic_dims[node->src[1]] != -1) { + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]]; + } + break; + case GGML_OP_PERMUTE: + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]]; + // auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx]; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (node->op_params[i] == dynamic_dim_idx) { + m_node_dynamic_dims[node] = i; + break; + } + } + // OPENVINO_ASSERT(dynamic_dim_value == node->ne[m_node_dynamic_dims[node]], + // "Dynamic dim value mismatch for node: " + std::string(node->name) + + // " and its src[0]: " + std::string(node->src[0]->name)); + } + break; + case GGML_OP_VIEW: { + // Use stride-based matching: the stride of a VIEW dimension directly + // encodes which source dimension it indexes into, so it uniquely + // identifies the dynamic dim even when two dims share the same size. + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + if (node->src[0]->op == GGML_OP_NONE) { + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; + break; + } + auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]]; + auto dynamic_dim_value = node->src[0]->ne[dynamic_dim_idx]; + auto dynamic_dim_stride = + node->src[0]->nb[dynamic_dim_idx] / ggml_type_size(node->src[0]->type) * ggml_type_size(node->type); + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (node->nb[i] == dynamic_dim_stride) { + m_node_dynamic_dims[node] = i; + break; + } + } + if (m_node_dynamic_dims[node] != -1 && dynamic_dim_value != node->ne[m_node_dynamic_dims[node]]) { + m_node_dynamic_dims[node] = -1; + // std::cout << "Warning: Dynamic dim value mismatch for node: " << node->name + // << " and its src[0]: " << node->src[0]->name << std::endl; + } + } + break; + } + case GGML_OP_TRANSPOSE: + case GGML_OP_RESHAPE: { + // RESHAPE requires src[0] to be contiguous, so both src and result + // have standard compact strides: nb[i] = type_size * prod(ne[0..i-1]). + // Match src->nb[dynamic_dim] against result->nb[i] to find the output + // dimension whose flat-memory boundary aligns with the source dynamic + // boundary. This is unambiguous (result strides are strictly monotone) + // and handles merged-lower-dim cases that ne-value matching misses. + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]]; + auto dynamic_dim_stride = node->src[0]->nb[dynamic_dim_idx]; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) { + m_node_dynamic_dims[node] = i; + break; + } + } + if (m_node_dynamic_dims[node] == -1) { + // std::cout << "Cannot determine dynamic dim for RESHAPE node: " << node->name << std::endl; + } + } + break; + } + case GGML_OP_FLASH_ATTN_EXT: { + // Output shape is hard-coded in ggml_flash_attn_ext as: + // ne = { v->ne[0], q->ne[2], q->ne[1], q->ne[3] } + // i.e. output dim 0 <- v dim 0 (head_size, static) + // output dim 1 <- q dim 2 (n_heads, static) + // output dim 2 <- q dim 1 (n_tokens, potentially dynamic) + // output dim 3 <- q dim 3 (batch, static) + // Using the fixed q-dim -> output-dim mapping table. + // q is src[0]; the mapping from q's dynamic dim to the output dim is: + // q dim 1 -> output dim 2 + // q dim 2 -> output dim 1 + // q dim 3 -> output dim 3 + // q dim 0 -> output dim 0 (head_size axis, unlikely to be dynamic) + constexpr int q_to_out[GGML_MAX_DIMS] = {0, 2, 1, 3}; + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + auto q_dynamic_dim = m_node_dynamic_dims[node->src[0]]; + m_node_dynamic_dims[node] = q_to_out[q_dynamic_dim]; + } + break; + } + case GGML_OP_CONT: + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[0]] != -1) { + auto dynamic_dim_idx = m_node_dynamic_dims[node->src[0]]; + if (ggml_are_same_shape(node, node->src[0])) { + m_node_dynamic_dims[node] = dynamic_dim_idx; + } else { + size_t src_logical_nb[GGML_MAX_DIMS]; + src_logical_nb[0] = ggml_type_size(node->src[0]->type); + src_logical_nb[1] = src_logical_nb[0] * (node->src[0]->ne[0] / ggml_blck_size(node->src[0]->type)); + for (int i = 2; i < GGML_MAX_DIMS; i++) { + src_logical_nb[i] = src_logical_nb[i - 1] * node->src[0]->ne[i - 1]; + } + + auto dynamic_dim_stride = src_logical_nb[dynamic_dim_idx] / ggml_type_size(node->src[0]->type) * + ggml_type_size(node->type); + int matched_dim_count = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (node->nb[i] == dynamic_dim_stride && node->ne[i] == node->src[0]->ne[dynamic_dim_idx]) { + m_node_dynamic_dims[node] = i; + matched_dim_count++; + } + } + if (matched_dim_count != 1) { + m_node_dynamic_dims[node] = -1; + // std::cout << "Warning: Cannot determine dynamic dim for CONT node: " << node->name + // << " and its src[0]: " << node->src[0]->name << std::endl; + } + } + } + break; + case GGML_OP_RMS_NORM: + case GGML_OP_NORM: + case GGML_OP_ADD: + case GGML_OP_GLU: + case GGML_OP_ROPE: + case GGML_OP_SCALE: + case GGML_OP_SOFT_MAX: + case GGML_OP_ARGSORT: + case GGML_OP_ADD_ID: + case GGML_OP_UNARY: + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[0]]; + break; + case GGML_OP_MUL_MAT_ID: + m_node_dynamic_dims[node] = m_node_dynamic_dims[node->src[1]]; + break; + case GGML_OP_CPY: + case GGML_OP_SET_ROWS: + m_node_dynamic_dims[node] = -1; + break; + case GGML_OP_IM2COL: { + m_node_dynamic_dims[node] = -1; + if (m_node_dynamic_dims[node->src[1]] != -1) { + const bool is_2D = node->op_params[6] == 1; + const int src_dyn = m_node_dynamic_dims[node->src[1]]; + if (is_2D) { + if (src_dyn == 0) { + m_node_dynamic_dims[node] = 1; // IW -> OW + } else if (src_dyn == 1) { + m_node_dynamic_dims[node] = 2; // IH -> OH + } else if (src_dyn == 3) { + m_node_dynamic_dims[node] = 3; // N -> N + } + } else { + if (src_dyn == 0) { + m_node_dynamic_dims[node] = 1; // IW -> OW + } else if (src_dyn == 2) { + m_node_dynamic_dims[node] = 2; // N -> N (1D: b->ne[2] is the batch/channel dim) + } + } + if (m_node_dynamic_dims[node] != -1) { + OPENVINO_ASSERT(node->src[1]->ne[src_dyn] == node->ne[m_node_dynamic_dims[node]], + "Dynamic dim value mismatch for IM2COL node: " + std::string(node->name) + + " and its src[1]: " + std::string(node->src[1]->name)); + } + } + break; + } + default: + // std::cout << "Doesn't handle node name: " << node->name << " op: " << ggml_op_name(node->op) << std::endl; + break; + } + }; + + for (int i = 0; i < m_cgraph->n_nodes; i++) { + ggml_tensor * node = m_cgraph->nodes[i]; + visit_node(visit_node, node); + } + + // print the nodes in m_cgraph name & shape with the dynamic dim (the dynamic dim is the dimension with -1 in m_node_dynamic_dims) for debugging + if (0) { + for (int i = 0; i < m_cgraph->n_nodes; i++) { + ggml_tensor * node = m_cgraph->nodes[i]; + int dynamic_dim = m_node_dynamic_dims[node]; + std::cout << "[" << i << "] " << "node_name: " << node->name << " op: " << ggml_op_name(node->op) + << " shape: ["; + for (int j = 0; j < 4; j++) { + if (j == dynamic_dim) { + std::cout << "*"; + } else { + std::cout << node->ne[j]; + } + if (j < 3) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; + // print the src name & shape with the dynamic dim for debugging + for (int j = 0; j < GGML_MAX_SRC; j++) { + ggml_tensor * src = node->src[j]; + if (src == nullptr) { + continue; + } + int src_dynamic_dim = m_node_dynamic_dims[src]; + std::cout << " [" << j << "] src_name: " << src->name << " ["; + for (int k = 0; k < 4; k++) { + if (k == src_dynamic_dim) { + std::cout << "*"; + } else { + std::cout << src->ne[k]; + } + if (k < 3) { + std::cout << ", "; + } + } + std::cout << "]" << std::endl; + } + std::cout << std::endl; + } + } +} diff --git a/ggml/src/ggml-openvino/ggml-decoder.h b/ggml/src/ggml-openvino/ggml-decoder.h index 3ae25ddda320..ae545f47e5fe 100644 --- a/ggml/src/ggml-openvino/ggml-decoder.h +++ b/ggml/src/ggml-openvino/ggml-decoder.h @@ -1,6 +1,7 @@ #pragma once -#include "ggml-quants.h" +#include "ggml-backend-impl.h" +#include "ggml-backend.h" #include "ggml.h" #include "openvino/decoder.h" @@ -14,21 +15,21 @@ struct ModelParams { int ctx = -1; - int ctx_swa = -1; int ctx_per_seq = -1; int ctx_per_seq_swa = -1; int n_seq = 1; - int n_heads = -1; int n_heads_kv = -1; int head_size = -1; int32_t rope_params[15]; + bool mixed_rope_params = false; std::vector swa_layers; std::vector kv_names; size_t kv_buffer_ctx_id = 0; bool same_rope_params(const ModelParams & other) const { - return memcmp(rope_params, other.rope_params, sizeof(int32_t) * 15) == 0; + return mixed_rope_params == other.mixed_rope_params && + memcmp(rope_params, other.rope_params, sizeof(int32_t) * 15) == 0; } bool can_reuse_dynamically(const ModelParams & other) const { return same_rope_params(other); } @@ -56,12 +57,14 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::string node_name; std::string node_op_type; std::map node_inputs; + std::map>> node_inputs_views; std::vector node_inputs_names; ggml_tensor * node_output; std::string node_output_name; int node_op_case = 0; void * data_addr; }; + // Graph decoder GgmlOvDecoder(ggml_cgraph * cgraph, ModelParams & model_params, @@ -69,6 +72,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map> & model_weights, bool is_static, bool is_stateful = false, + bool model_is_splitted = false, bool is_prefill = false, int prefill_chunk_size = 256); @@ -84,6 +88,42 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual std::vector get_input_stride(int node_idx, const std::string & name) const override; + virtual size_t get_view_input_size(int node_idx, const std::string & name) const override; + + virtual size_t get_view_input_offset(int node_idx, const std::string & name, size_t view_index) const override; + + virtual size_t get_view_input_src_offset(int node_idx, const std::string & name, size_t view_index) const override; + + virtual std::vector get_view_input_stride(int node_idx, + const std::string & name, + size_t view_index) const override; + + virtual std::vector get_view_input_src_stride(int node_idx, + const std::string & name, + size_t view_index) const override; + + virtual ov::Shape get_view_input_ggml_shape(int node_idx, + const std::string & name, + size_t view_index) const override; + + virtual ov::Shape get_view_input_src_ggml_shape(int node_idx, + const std::string & name, + size_t view_index) const override; + + virtual ov::PartialShape get_view_input_ov_shape(int node_idx, + const std::string & name, + size_t view_index) const override; + + virtual ov::PartialShape get_view_input_src_ov_shape(int node_idx, + const std::string & name, + size_t view_index) const override; + + virtual std::string get_view_input_name(int node_idx, const std::string & name, size_t view_index) const override; + + virtual std::string get_view_input_src_name(int node_idx, + const std::string & name, + size_t view_index) const override; + virtual ov::element::Type get_input_type(int node_idx, const std::string & name) const override; virtual size_t get_input_size() const override; @@ -106,10 +146,14 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual ov::element::Type get_output_type(int node_idx) const override; + virtual std::vector get_output_stride(int node_idx) const override; + virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const override; virtual int32_t * get_output_op_params(int node_idx) const override; + virtual size_t get_output_op_offset(int node_idx) const override; + virtual std::vector get_output_names(int node_idx) const override; virtual const std::string & get_op_type() const override; @@ -120,7 +164,10 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual const std::string & get_op_name(int node_idx) const override; - virtual void visit_subgraph(std::function, int node_idx)> node_visitor) const override; + virtual int32_t get_op_dynamic_dim(int node_idx) const override; + + virtual void visit_subgraph( + std::function, int node_idx)> node_visitor) const override; ggml_tensor * get_input_ggml_tensor(const std::string & name) const { return m_inputs.at(name); } @@ -142,16 +189,12 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { return m_model_weights; } - virtual std::vector get_model_output_names() const override { - return m_model_output_names; - } + virtual std::vector get_model_output_names() const override { return m_model_output_names; } const std::map & get_model_outputs() const { return m_model_outputs; } virtual int get_ctx_size() const { return m_model_params.ctx; } - virtual int get_ctx_swa_size() const { return m_model_params.ctx_swa; } - virtual int get_ctx_per_seq() const { return m_model_params.ctx_per_seq; } virtual int get_ctx_per_seq_swa() const { return m_model_params.ctx_per_seq_swa; } @@ -169,13 +212,21 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { virtual int32_t * get_rope_params() const override { return const_cast(m_model_params.rope_params); } + virtual bool has_mixed_rope_params() const override { return m_model_params.mixed_rope_params; } + virtual std::map get_kv_param_res_names() const override; virtual bool is_static() const override { return m_is_static; } virtual bool is_stateful() const override { return m_is_stateful; } - ov::PartialShape get_graph_input_shape(const ggml_tensor * op, const ggml_tensor * input) const; + int get_static_n_tokens() const { return m_is_prefill ? m_prefill_chunk_size : 1; } + + virtual bool is_splited_model() const override { return m_model_is_splitted; } + + ov::PartialShape get_graph_input_shape(const ggml_tensor * op, + const ggml_tensor * input, + int dynamic_dim_index = -1) const; static void dump_cgraph(const ggml_cgraph * cgraph, std::string & filename); @@ -205,6 +256,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { bool m_is_prefill = false; bool m_naive = false; int m_prefill_chunk_size = 0; + bool m_model_is_splitted = false; // label the cgraph is splited or not static ov::Shape get_shape(const ggml_tensor * tensor); static std::vector get_stride(const ggml_tensor * tensor); @@ -227,7 +279,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { } inline static bool is_inp_mask(const ggml_tensor * tensor, const ggml_tensor * op) { - return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]); + return op->op == GGML_OP_CPY || (op->op == GGML_OP_FLASH_ATTN_EXT && tensor == op->src[3]) || + (op->op == GGML_OP_SOFT_MAX && tensor == op->src[1]); } inline static bool is_rope_freqs_weight(const ggml_tensor * tensor, const ggml_tensor * op) { @@ -235,7 +288,8 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { } inline static bool is_kvcache(const ggml_tensor * tensor, const ggml_tensor * op) { - return op->op == GGML_OP_SET_ROWS && op->src[2] == tensor; + return tensor->buffer->usage == GGML_BACKEND_BUFFER_USAGE_ANY || + (op != nullptr && op->op == GGML_OP_SET_ROWS && op->src[2] == tensor); } inline static bool is_kv_idx(const ggml_tensor * tensor, const ggml_tensor * op) { @@ -243,23 +297,18 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { } inline static bool is_output_idx(const ggml_tensor * tensor, const ggml_tensor * op) { - return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op != GGML_OP_NONE; + return op->op == GGML_OP_GET_ROWS && tensor == op->src[1] && op->src[0]->op != GGML_OP_NONE && + op->src[1]->op == GGML_OP_NONE; } - static std::string get_graph_input_ov_name(const ggml_tensor * tensor, const ggml_tensor * op) { - if (is_inp_tok(tensor, op)) { - return "inp_tokens"; - } + std::string get_graph_input_ov_name(const ggml_tensor * tensor, const ggml_tensor * op) { if (is_inp_pos(tensor, op)) { return "inp_pos"; } if (is_inp_emb(tensor, op)) { return "embd"; } - if (is_output_idx(tensor, op)) { - return "inp_out_ids"; - } - if (is_inp_mask(tensor, op)) { + if (is_stateful() && is_inp_mask(tensor, op)) { return std::string(tensor->name).find("swa") == std::string::npos ? "self_kq_mask" : "self_kq_mask_swa"; } return tensor->name; @@ -272,6 +321,9 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void compute_model_inputs(); void compute_model_outputs(); + // Infer and propagate dynamic-dimension indices for all tensors in the GGML graph. + void compute_node_dynamic_dims(); + void validate_cgraph() const; ggml_cgraph * m_cgraph = nullptr; @@ -284,6 +336,7 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { std::map m_model_outputs; std::vector m_model_output_names; std::vector m_node_info_list; + std::map m_node_dynamic_dims; ModelParams m_model_params; ComputeParams m_compute_params; @@ -291,4 +344,4 @@ class GgmlOvDecoder : public ov::frontend::ggml::GgmlDecoder { void print_tensor_address_map(const ggml_cgraph * cgraph); -int extract_layer_from_name(const std::string & name); +std::optional extract_layer_from_name(const std::string & name); diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp index 4140136aca25..d9ad7be734d1 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.cpp @@ -3,6 +3,7 @@ #include "ggml-impl.h" #include "ggml.h" +#include #include #include #include @@ -22,7 +23,38 @@ void ggml_openvino_device_config::init() { if (initialized) { return; } - device_name = getenv("GGML_OPENVINO_DEVICE") ? getenv("GGML_OPENVINO_DEVICE") : "CPU"; + + // All recognized GGML_OPENVINO_* env vars. Their values are cached here + // once at backend init time and read back via ggml_openvino_getenv_str() + // (raw string) or ggml_openvino_getenv_int() (integer / boolean toggle). + static constexpr const char * env_var_names[] = { + // String values (use ggml_openvino_getenv_str) + "GGML_OPENVINO_DEVICE", + "GGML_OPENVINO_CACHE_DIR", + // Integer values (use ggml_openvino_getenv_int) + "GGML_OPENVINO_PREFILL_CHUNK_SIZE", + // Boolean toggles (treated as int flags via ggml_openvino_getenv_int) + "GGML_OPENVINO_STATEFUL_EXECUTION", + "GGML_OPENVINO_PROFILING", + "GGML_OPENVINO_DUMP_CGRAPH", + "GGML_OPENVINO_DUMP_IR", + "GGML_OPENVINO_DEBUG_INPUT", + "GGML_OPENVINO_DEBUG_OUTPUT", + "GGML_OPENVINO_PRINT_CGRAPH_TENSOR_ADDRESS", + "GGML_OPENVINO_ENABLE_CACHE", + "GGML_OPENVINO_DISABLE_CACHE", + "GGML_OPENVINO_DISABLE_KV_SLICE", + "GGML_OPENVINO_MANUAL_GQA_ATTN", + }; + + for (const char * const & env_var : env_var_names) { + auto * env = getenv(env_var); + if (env) { + environment_variables[env_var] = env; + } + } + + device_name = ggml_openvino_getenv_str("GGML_OPENVINO_DEVICE", "CPU"); auto available_devices = ov_singleton_core().get_available_devices(); if (std::find(available_devices.begin(), available_devices.end(), device_name) == available_devices.end()) { GGML_LOG_WARN("GGML OpenVINO Backend: device %s is not available, fallback to CPU\n", device_name.c_str()); @@ -30,7 +62,7 @@ void ggml_openvino_device_config::init() { } is_npu = (device_name == "NPU"); - auto * cache_dir = getenv("GGML_OPENVINO_CACHE_DIR"); + const char * cache_dir = ggml_openvino_getenv_str("GGML_OPENVINO_CACHE_DIR"); if (device_name == "NPU") { compile_config = { {"NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES" }, @@ -119,6 +151,23 @@ const std::string & ggml_openvino_get_device_name() { return ggml_openvino_get_device_config().device_name; } +// Get the value of a GGML_OPENVINO_* env var as a string. Returns +// default_value when the var is unset or set to an empty string. +const char * ggml_openvino_getenv_str(const char * var, const char * default_value) { + auto & env_map = ggml_openvino_get_device_config().environment_variables; + auto it = env_map.find(var); + return (it == env_map.end() || it->second.empty()) ? default_value : it->second.c_str(); +} + +// Get the value of a GGML_OPENVINO_* env var as an int (via std::atoi). +// Returns default_value (0) when the var is unset or empty. Used for both +// integer settings (e.g. GGML_OPENVINO_PREFILL_CHUNK_SIZE) and boolean +// toggles: "0" disables, any non-zero integer enables. +int ggml_openvino_getenv_int(const char * var, int default_value) { + const char * v = ggml_openvino_getenv_str(var, nullptr); + return v ? std::atoi(v) : default_value; +} + // Check if running on NPU bool ggml_openvino_is_npu() { return ggml_openvino_get_device_config().is_npu; @@ -173,7 +222,8 @@ std::optional ggml_openvino_get_requant_type(const ggml_tensor * return std::nullopt; } if (strncmp(tensor->name, "token_embd.weight", 17) == 0) { - return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 : ExtraQuantType::Q8_0_C); + return ((ggml_openvino_is_npu() && tensor->type == GGML_TYPE_Q6_K) ? ExtraQuantType::F16 : + ExtraQuantType::Q8_0_C); } if (strncmp(tensor->name, "output.weight", 13) == 0) { return ExtraQuantType::Q8_0_C; @@ -298,6 +348,10 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten layout.is_symmetric = true; break; + case GGML_TYPE_Q5_1: + // u8 weights (5-bit values), asymmetric (scale + zero point) + break; + case GGML_TYPE_Q6_K: layout.weights_per_block = 16; layout.is_symmetric = true; diff --git a/ggml/src/ggml-openvino/ggml-openvino-extra.h b/ggml/src/ggml-openvino/ggml-openvino-extra.h index cd0baf4a681b..c2654fbfa1b8 100644 --- a/ggml/src/ggml-openvino/ggml-openvino-extra.h +++ b/ggml/src/ggml-openvino/ggml-openvino-extra.h @@ -64,6 +64,7 @@ struct ggml_openvino_device_config { bool initialized = false; std::optional remote_context; ov::AnyMap compile_config; + std::unordered_map environment_variables; cl_command_queue cl_queue = nullptr; void init(); @@ -79,6 +80,22 @@ void ggml_openvino_init_device_config(); // Get the device name const std::string & ggml_openvino_get_device_name(); +// Environment variable accessors. All GGML_OPENVINO_* env vars are read once +// during backend init and cached on the device config; consumers must go +// through these helpers (never call ::getenv directly) so behavior stays +// consistent and centralized. +// +// Use ggml_openvino_getenv_str() for string / path values +// (e.g. GGML_OPENVINO_DEVICE, GGML_OPENVINO_CACHE_DIR). The optional +// default_value is returned when the var is unset or empty. +// +// Use ggml_openvino_getenv_int() for boolean toggles and integer settings. +// It returns std::atoi(value) when set, otherwise default_value. For +// boolean use, `if (ggml_openvino_getenv_int(name))` is true iff the value +// is a non-zero integer (so "0" disables, "1" enables). +const char * ggml_openvino_getenv_str(const char * var, const char * default_value = nullptr); +int ggml_openvino_getenv_int(const char * var, int default_value = 0); + // Check if running on NPU bool ggml_openvino_is_npu(); @@ -115,9 +132,9 @@ struct ggml_openvino_weight_extra : public ggml_openvino_extra_base { // Extra data for quantized weight tensors - stores extracted weights/scales/zp and weight node struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base { - ov::Tensor weights; // U4 or U8 extracted weights - ov::Tensor scales; // F16 scales - ov::Tensor zp; // U4 or U8 zero points (same type as weights) + ov::Tensor weights; // U4 or U8 extracted weights + ov::Tensor scales; // F16 scales + ov::Tensor zp; // U4 or U8 zero points (same type as weights) std::shared_ptr weight_node; // Pre-built OpenVINO weight subgraph ggml_openvino_quantized_weight_extra(ov::Tensor w, ov::Tensor s, ov::Tensor z, std::shared_ptr n) : @@ -132,8 +149,9 @@ struct ggml_openvino_quantized_weight_extra : public ggml_openvino_extra_base { struct ggml_openvino_tensor_extra : public ggml_openvino_extra_base { std::shared_ptr tensor; // For direct use with infer_request - explicit ggml_openvino_tensor_extra(std::shared_ptr t) - : ggml_openvino_extra_base(Type::TENSOR), tensor(std::move(t)) {} + explicit ggml_openvino_tensor_extra(std::shared_ptr t) : + ggml_openvino_extra_base(Type::TENSOR), + tensor(std::move(t)) {} }; // ===================================================== @@ -152,11 +170,11 @@ struct ggml_openvino_extracted_layout { size_t zp_size = 0; // Size of zero points in bytes (U4 or U8) bool is_u4; // true for U4 weights, false for U8 int64_t weights_per_block; // weights per scale/zp block - bool is_symmetric; // true for symmetric quantization + bool is_symmetric; // true for symmetric quantization // Requantization info - bool is_requant = false; // true if this tensor needs requantization - std::optional requant_type; // target requant type if is_requant + bool is_requant = false; // true if this tensor needs requantization + std::optional requant_type; // target requant type if is_requant }; // Calculate the buffer layout for extracted quantized data @@ -164,6 +182,9 @@ ggml_openvino_extracted_layout ggml_openvino_get_extracted_layout(const ggml_ten ggml_openvino_tensor_extra * ggml_openvino_create_tensor_extra(const ggml_tensor * tensor, bool is_remote); +// Check if a tensor's buffer uses remote (device) memory (e.g. GPU USM) +bool ggml_openvino_buffer_is_remote(const ggml_tensor * tensor); + // Register an extra with the tensor's OpenVINO buffer context for proper lifetime management. // This sets tensor->extra and tracks the extra in the buffer context for cleanup. void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra); diff --git a/ggml/src/ggml-openvino/ggml-openvino.cpp b/ggml/src/ggml-openvino/ggml-openvino.cpp index 4f3ebf2536b0..659dbd4b5acb 100644 --- a/ggml/src/ggml-openvino/ggml-openvino.cpp +++ b/ggml/src/ggml-openvino/ggml-openvino.cpp @@ -4,13 +4,14 @@ #include "ggml-backend.h" #include "ggml-impl.h" #include "ggml-openvino-extra.h" +#include "ggml-openvino/openvino/op_table.h" #include "ggml-openvino/utils.h" #include "ggml-quants.h" #include "ggml.h" #include -#include #include +#include #include #include #include @@ -146,8 +147,7 @@ static void * ggml_backend_openvino_buffer_get_base(ggml_backend_buffer_t buffer } static bool is_stateful_enabled() { - static const auto * stateful = getenv("GGML_OPENVINO_STATEFUL_EXECUTION"); - return stateful && *stateful != '\0' && strcmp(stateful, "0") != 0; + return ggml_openvino_getenv_int("GGML_OPENVINO_STATEFUL_EXECUTION") != 0; } static enum ggml_status ggml_backend_openvino_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { @@ -367,11 +367,9 @@ static bool ggml_backend_openvino_buffer_cpy_tensor(ggml_backend_buffer_t buffer ggml_backend_openvino_buffer_context * src_ctx = (ggml_backend_openvino_buffer_context *) src->buffer->context; if (src_ctx->is_remote) { - cl_int err = - mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr); + cl_int err = mem_cpy_fn(queue, CL_TRUE, dst->data, src->data, ggml_nbytes(src), 0, nullptr, nullptr); if (err != CL_SUCCESS) { - GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (device-to-device) failed with error %d\n", __func__, - err); + GGML_LOG_ERROR("%s: clEnqueueMemcpyINTEL (device-to-device) failed with error %d\n", __func__, err); return false; } return true; @@ -579,6 +577,17 @@ size_t ggml_backend_openvino_buffer_get_ctx_id(ggml_backend_buffer_t buffer) { return ctx->id; } +bool ggml_openvino_buffer_is_remote(const ggml_tensor * tensor) { + if (tensor == nullptr || tensor->buffer == nullptr) { + return false; + } + if (!ggml_backend_buffer_is_openvino(tensor->buffer)) { + return false; + } + auto * ctx = static_cast(tensor->buffer->context); + return ctx->is_remote; +} + void ggml_openvino_buffer_register_extra(ggml_tensor * tensor, ggml_openvino_extra_base * extra) { GGML_ASSERT(tensor != nullptr); GGML_ASSERT(tensor->buffer != nullptr); @@ -785,6 +794,18 @@ static bool has_view_op_input(const ggml_tensor * op) { return false; } +static bool has_non_contiguous_view_input(const ggml_tensor * op) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (op->src[i] == nullptr) { + break; + } + if (op->src[i]->op == GGML_OP_VIEW && !ggml_is_contiguous(op->src[i])) { + return true; + } + } + return false; +} + static bool is_supported_flash_attn_pattern(const ggml_tensor * op) { // pattern of q,k,v should be q->op==PERMUTE, q->src[0]->op==VIEW, q->src[0]->src[0]->view_src==nullptr for (int i = 0; i < 3; i++) { @@ -797,17 +818,107 @@ static bool is_supported_flash_attn_pattern(const ggml_tensor * op) { return true; } +static bool is_gemma3n_flash_attn_pattern(const ggml_tensor * op) { + if (!is_supported_flash_attn_pattern(op)) { + return false; + } + + const ggml_tensor * q_base = + op->src[0] != nullptr && op->src[0]->src[0] != nullptr ? op->src[0]->src[0]->src[0] : nullptr; + const ggml_tensor * k_base = + op->src[1] != nullptr && op->src[1]->src[0] != nullptr ? op->src[1]->src[0]->src[0] : nullptr; + const ggml_tensor * v_base = + op->src[2] != nullptr && op->src[2]->src[0] != nullptr ? op->src[2]->src[0]->src[0] : nullptr; + + if (q_base == nullptr || q_base->op != GGML_OP_ROPE) { + return false; + } + + // gemma3n direct attention path (no KV cache): q=ROPE, k=ROPE, v=RMS_NORM + // Only match this specific pattern to avoid falsely catching other models + // (e.g. Gemma4) that also use scale=1.0 with KV-cache backed attention. + const bool is_qkv_direct = + k_base != nullptr && v_base != nullptr && k_base->op == GGML_OP_ROPE && v_base->op == GGML_OP_RMS_NORM; + + return is_qkv_direct; +} + +static bool checked_mul_size(size_t a, size_t b, size_t & out) { + if (a == 0 || b == 0) { + out = 0; + return true; + } + if (a > SIZE_MAX / b) { + return false; + } + out = a * b; + return true; +} + +static bool mul_mat_id_requires_large_tmp(const ggml_tensor * op) { + const ggml_tensor * as = op->src[0]; + const ggml_tensor * ids = op->src[2]; + if (as == nullptr || ids == nullptr) { + return true; + } + + // The current OpenVINO translation materializes selected expert weights with + // shape [n_tokens, n_used, rows, k]. Skip cases that would create a very + // large temporary on GPU and let the scheduler fall back instead. + size_t tmp_elems = 1; + if (!checked_mul_size(tmp_elems, static_cast(ids->ne[1]), tmp_elems) || + !checked_mul_size(tmp_elems, static_cast(ids->ne[0]), tmp_elems) || + !checked_mul_size(tmp_elems, static_cast(as->ne[1]), tmp_elems) || + !checked_mul_size(tmp_elems, static_cast(as->ne[0]), tmp_elems)) { + return true; + } + + size_t tmp_bytes = 0; + if (!checked_mul_size(tmp_elems, sizeof(float), tmp_bytes)) { + return true; + } + + static constexpr size_t mul_mat_id_tmp_limit = 1ULL << 30; // 1 GiB + return tmp_bytes > mul_mat_id_tmp_limit; +} + static bool is_op_unsupported_case(const ggml_tensor * op) { switch (op->op) { + case GGML_OP_CONCAT: { + if (op->type == GGML_TYPE_I64) { + return true; + } + break; + } case GGML_OP_GET_ROWS: case GGML_OP_SET_ROWS: { if (op->ne[3] != 1) { return true; } + if (op->ne[0] == 256 && (op->src[0]->type == GGML_TYPE_Q4_K || op->src[0]->type == GGML_TYPE_Q5_K)) { + // ERR = 0.000000306 > 0.000000100 GET_ROWS(type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0) + // ERR = 0.000000197 > 0.000000100 GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0) + return true; + } + + // Keep the MoE routing weights gather on CPU for GPU runs. Splitting + // only at the later SUM/CLAMP/DIV nodes still leaves this routing path + // numerically unstable for arctic-style MoE graphs. + if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) { + return true; + } + break; + } + case GGML_OP_RESHAPE: { + if (strncmp(op->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0 || + strncmp(op->name, "ffn_norm_exps", sizeof("ffn_norm_exps") - 1) == 0) { + return true; + } break; } case GGML_OP_ADD: - case GGML_OP_MUL: { + case GGML_OP_MUL: + case GGML_OP_SUB: { if (op->src[1]->op == GGML_OP_PERMUTE) { return true; } @@ -818,30 +929,79 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { } break; } + case GGML_OP_ADD_ID: { + // Keep support aligned with the CPU backend implementation, which only handles f32 inputs/output and i32 ids. + if (op->type != GGML_TYPE_F32 || op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type != GGML_TYPE_F32 || + op->src[2]->type != GGML_TYPE_I32) { + return true; + } + break; + } + case GGML_OP_DIV: { + bool requires_broadcast = false; + for (int i = 0; i < 4; i++) { + if (op->src[0]->ne[i] == op->src[1]->ne[i]) { + continue; + } + + if (op->src[0]->ne[i] != 1 && op->src[1]->ne[i] != 1) { + return true; + } + + requires_broadcast = true; + } + + // The GPU plugin can fuse broadcast DIV into the preceding FFN GEMM path + // and produce infs for per-channel scale vectors. Keep those DIVs on CPU + // until the fused GPU kernel is reliable. (falied case llama-arch-test mpt) + if (requires_broadcast && ggml_openvino_get_device_name() == "GPU") { + return true; + } + + // qwen3next MoE weight normalization is numerically sensitive on the GPU + // path. Keep the normalization divide on CPU to match the reference. + if (strncmp(op->name, "ffn_moe_weights_norm", sizeof("ffn_moe_weights_norm") - 1) == 0) { + return true; + } + break; + } case GGML_OP_SOFT_MAX: { if (op->src[2] != nullptr) { // GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with sinks\n"); return true; } - float scale = 1.0f; - float max_bias = 0.0f; - const auto * op_params = op->op_params; - memcpy(&scale, (const float *) op_params + 0, sizeof(float)); - memcpy(&max_bias, (const float *) op_params + 1, sizeof(float)); - if (max_bias > 0) { - // GGML_LOG_WARN("OpenVINO backend does not support SOFT_MAX with max_bias > 0\n"); + + if (strncmp(op->name, "ffn_moe_probs", sizeof("ffn_moe_probs") - 1) == 0) { + return true; + } + + // GPU execution of the MoE routing weights softmax is numerically unstable + // when fused with the surrounding GET_ROWS/reshape path. Keep this softmax + // on CPU so the scheduler splits at the same boundary that restores parity. + if (op->src[0] != nullptr && op->src[0]->op == GGML_OP_RESHAPE && op->src[0]->src[0] != nullptr && + strncmp(op->src[0]->src[0]->name, "ffn_moe_weights", sizeof("ffn_moe_weights") - 1) == 0) { return true; } break; } - case GGML_OP_FLASH_ATTN_EXT: { - if (op->src[4] != nullptr) { - // GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n"); + case GGML_OP_SUM_ROWS: { + if (strncmp(op->name, "ffn_moe_weights_sum", sizeof("ffn_moe_weights_sum") - 1) == 0) { return true; } - if (!is_supported_flash_attn_pattern(op)) { + + // if the input is PERMUTE skip + if (op->src[0]->op == GGML_OP_PERMUTE) { + return true; + } + break; + } + case GGML_OP_CLAMP: { + if (strncmp(op->name, "ffn_moe_weights_sum_clamped", sizeof("ffn_moe_weights_sum_clamped") - 1) == 0) { return true; } + break; + } + case GGML_OP_FLASH_ATTN_EXT: { float scale = 1.0f; float max_bias = 0.0f; float logit_softcap = 0.0f; @@ -849,6 +1009,21 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { memcpy(&scale, (const float *) op_params + 0, sizeof(float)); memcpy(&max_bias, (const float *) op_params + 1, sizeof(float)); memcpy(&logit_softcap, (const float *) op_params + 2, sizeof(float)); + + // Keep gemma3n flash-attn pattern on CPU for GPU runs to avoid + // accuracy drift in the OpenVINO path. Restrict by scale=1.0 to avoid + // affecting non-gemma3n models such as Llama-3.2. + if (fabsf(scale - 1.0f) < 1e-6f && is_gemma3n_flash_attn_pattern(op)) { + return true; + } + + if (op->src[4] != nullptr) { + // GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with sinks\n"); + return true; + } + if (!is_supported_flash_attn_pattern(op)) { + return true; + } if (max_bias > 0) { // GGML_LOG_WARN("OpenVINO backend does not support FLASH_ATTN_EXT with max_bias > 0\n"); return true; @@ -868,30 +1043,44 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { break; } case GGML_OP_CPY: { - if (op->src[1] != op) { - // GGML_LOG_WARN("OpenVINO backend only supports CPY that is a cast\n"); + if (op->src[0]->type == GGML_TYPE_BF16 || op->src[1]->type == GGML_TYPE_BF16) { + // GGML_LOG_WARN("OpenVINO backend does not support CPY with non-contiguous data or bf16 types\n"); + return true; + } + // op test case with non-contiguous src or dst + if ((op->ne[0] == 3 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2) || + (op->ne[0] == 1 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2) || + (op->ne[0] == 2 && op->ne[1] == 4 && op->ne[2] == 3 && op->ne[3] == 2)) { + return true; + } + // CPY into a strided view of a larger buffer (recurrent-state snapshots) not supported + if (op->view_src && ggml_nbytes(op) != ggml_nbytes(op->view_src)) { return true; } break; } case GGML_OP_MUL_MAT: { - if (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F16) { - // Has accuracy issue, try enabling this and see `test-backend-ops -o "MUL_MAT"` - // GGML_LOG_WARN("OpenVINO backend does not support MUL_MAT with two F16 tensors\n"); + if (ggml_openvino_get_device_name() == "GPU" && op->src[1]->op == GGML_OP_SOFT_MAX && + op->src[0]->op == GGML_OP_CONT && op->src[0]->src[0] != nullptr && + op->src[0]->src[0]->op == GGML_OP_TRANSPOSE && op->src[0]->src[0]->src[0] != nullptr && + op->src[0]->src[0]->src[0]->op == GGML_OP_PERMUTE) { return true; } if (op->src[0]->ne[3] != op->src[1]->ne[3] && op->src[0]->ne[3] != 1 && op->src[1]->ne[3] != 1) { return true; } - if (op->src[0]->op == GGML_OP_PERMUTE || op->src[1]->op == GGML_OP_PERMUTE) { + if (op->src[0]->op == GGML_OP_VIEW && op->src[1]->op == GGML_OP_VIEW) { return true; } - if (ggml_is_quantized(op->src[0]->type) && op->src[0]->ne[1] == 1) { - // MUL_MAT(type_a=q4_0,type_b=f32,m=1,n=2048,k=8192,bs=[1,1],nr=[1,1],per=[0,1,2,3],k_v=0,o=1) - // triggers a bug in ov matmul_shape_inference.hpp + break; + } + case GGML_OP_MUL_MAT_ID: { + if (strncmp(op->name, "ffn_moe_gate_up", sizeof("ffn_moe_gate_up") - 1) == 0 || + strncmp(op->name, "ffn_moe_down", sizeof("ffn_moe_down") - 1) == 0) { return true; } - if (op->src[0]->op == GGML_OP_VIEW && op->src[1]->op == GGML_OP_VIEW) { + + if (mul_mat_id_requires_large_tmp(op)) { return true; } break; @@ -909,7 +1098,7 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { // op->src[0]->ne[0]); return true; } - if (op->type != GGML_TYPE_F32) { + if (op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) { // GGML_LOG_WARN("OpenVINO backend does not support ROPE with type %s\n", ggml_type_name(op->type)); return true; } @@ -930,15 +1119,54 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { } break; } - default: + case GGML_OP_TRANSPOSE: { + // if the type is bf16, will return true + if (op->type == GGML_TYPE_BF16) { + // GGML_LOG_WARN("OpenVINO backend does not support CONT with BF16 type\n"); + return true; + } break; } - if (op->op == GGML_OP_GET_ROWS) { - if (op->ne[0] == 256 && (op->src[0]->type == GGML_TYPE_Q4_K || op->src[0]->type == GGML_TYPE_Q5_K)) { - // ERR = 0.000000306 > 0.000000100 GET_ROWS(type=q4_K,n=256,m=5,r=4,be1=1,be2=1,v=0) - // ERR = 0.000000197 > 0.000000100 GET_ROWS(type=q5_K,n=256,m=5,r=4,be1=1,be2=1,v=0) + case GGML_OP_GATED_DELTA_NET: { + // enable after https://github.com/openvinotoolkit/openvino/pull/35917 is included in OV release + return true; + // if (ggml_openvino_get_device_name() == "GPU" && op->src[0]->ne[2] > 1) { + // // CVS-186471 + // return true; + // } + if (op->src[2]->op == GGML_OP_PERMUTE) { + return true; + } + // kda (per-key-dimension gating) not supported by fused GatedDeltaNet op + if (op->src[3]->ne[0] != 1) { + return true; + } + // v_repeat > 1 (GQA): ggml uses modulo head mapping (h_q = h_v % H_k) + // but the fused op uses consecutive mapping (h_q = h_v / group_size) + if (op->src[2]->ne[1] != op->src[0]->ne[1]) { + return true; + } + // K > 1 (multiple state snapshots) not supported by fused op + if (op->src[5]->ne[1] > 1) { + return true; + } + break; + } + case GGML_OP_SSM_CONV: { + // qwen3next is numerically unstable with OpenVINO SSM_CONV. + // Keep this op on CPU until the OpenVINO implementation is fixed. + return true; + } + case GGML_OP_VIEW: { + // Skip TOPK_MOE fused tests until it is fully supported + // the argsort_top_k VIEW wrapping ARGSORT is named "selected_experts" in test_topk_moe + if (strcmp(op->name, "selected_experts") == 0) { return true; } + break; + } + default: + break; } return false; } @@ -946,24 +1174,47 @@ static bool is_op_unsupported_case(const ggml_tensor * op) { static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { GGML_ASSERT(dev->reg != nullptr); - static std::set supported_types{GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, - GGML_TYPE_I32, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, - GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K}; - - static const std::set supported_ops{GGML_OP_NONE, GGML_OP_ADD, GGML_OP_MUL, GGML_OP_MUL_MAT, GGML_OP_VIEW, - /*GGML_OP_CONT,*/ GGML_OP_RESHAPE, GGML_OP_PERMUTE, GGML_OP_TRANSPOSE, - GGML_OP_GET_ROWS, GGML_OP_ROPE, GGML_OP_RMS_NORM, GGML_OP_SCALE, - // softmax is not updated due to replaced by flash_attn_ext - // GGML_OP_SOFT_MAX, - GGML_OP_SET_ROWS, GGML_OP_FLASH_ATTN_EXT, GGML_OP_CPY}; - static const std::set supported_unary_ops{ - GGML_UNARY_OP_GELU, - GGML_UNARY_OP_SILU, - }; - static const std::set supported_glu_ops{ - GGML_GLU_OP_SWIGLU, - GGML_GLU_OP_GEGLU, + static std::unordered_set supported_types{ + GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_I64, GGML_TYPE_I32, GGML_TYPE_Q4_0, + GGML_TYPE_Q4_1, GGML_TYPE_Q4_K, GGML_TYPE_Q5_1, GGML_TYPE_Q5_K, GGML_TYPE_Q8_0, GGML_TYPE_Q6_K}; + + // derive supported op sets from the op_table map, keys in + // the map use the full macro name (e.g. "GGML_OP_ADD"), while + // the ggml_*_op_name() helpers return only the trailing part (e.g. "ADD"). + // each set is built once and cached. + static const auto build_supported_sets = [] { + const auto & table = ov::frontend::ggml::get_supported_ops(); + std::unordered_set ops; + std::unordered_set unary_ops; + std::unordered_set glu_ops; + + // GGML_OP_NONE has no translator but is always safe to add to the supported set. + ops.insert(GGML_OP_NONE); + + for (int i = 0; i < GGML_OP_COUNT; ++i) { + const std::string key = std::string("GGML_OP_") + ggml_op_name(static_cast(i)); + if (table.count(key)) { + ops.insert(static_cast(i)); + } + } + for (int i = 0; i < GGML_UNARY_OP_COUNT; ++i) { + const std::string key = std::string("GGML_UNARY_OP_") + ggml_unary_op_name(static_cast(i)); + if (table.count(key)) { + unary_ops.insert(static_cast(i)); + } + } + for (int i = 0; i < GGML_GLU_OP_COUNT; ++i) { + const std::string key = std::string("GGML_GLU_OP_") + ggml_glu_op_name(static_cast(i)); + if (table.count(key)) { + glu_ops.insert(static_cast(i)); + } + } + return std::make_tuple(ops, unary_ops, glu_ops); }; + static const auto supported_sets = build_supported_sets(); + static const auto & supported_ops = std::get<0>(supported_sets); + static const auto & supported_unary_ops = std::get<1>(supported_sets); + static const auto & supported_glu_ops = std::get<2>(supported_sets); switch (op->op) { case GGML_OP_UNARY: { @@ -972,11 +1223,6 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con // GGML_LOG_WARN("OpenVINO backend does not support unary op %s\n", ggml_unary_op_name(ggml_get_unary_op(op))); return false; } - if (has_view_op_input(op)) { - // GGML_LOG_WARN("OpenVINO backend does not support unary op %s with view input\n", - // ggml_unary_op_name(ggml_get_unary_op(op))); - return false; - } break; } case GGML_OP_GLU: { @@ -1003,13 +1249,15 @@ static bool ggml_backend_openvino_device_supports_op(ggml_backend_dev_t dev, con return false; } static std::set ops_not_support_view_input{ - GGML_OP_GET_ROWS, - GGML_OP_RMS_NORM, + GGML_OP_L2_NORM, }; if (ops_not_support_view_input.find(op->op) != ops_not_support_view_input.end() && has_view_op_input(op)) { // GGML_LOG_WARN("OpenVINO backend does not support op %s with view input\n", ggml_op_name(op->op)); return false; } + if (op->op == GGML_OP_RMS_NORM && has_non_contiguous_view_input(op)) { + return false; + } } } diff --git a/ggml/src/ggml-openvino/ggml-quants.cpp b/ggml/src/ggml-openvino/ggml-quants.cpp index 57d66df4f017..275b95428273 100644 --- a/ggml/src/ggml-openvino/ggml-quants.cpp +++ b/ggml/src/ggml-openvino/ggml-quants.cpp @@ -126,6 +126,68 @@ void extract_q4_1_data(const ggml_tensor * tensor, } } +// Extracts (weight, scales, zp) from Q5_1 tensors. +// Data layout is: |16 bit scale|16 bit min|32 bit qh (5th bits)|32 x 4bit low nibbles|. +// Reconstructed quant q in [0,31]: q = (low nibble) | (qh_bit << 4). Dequant: w*d + m. +// Weights are stored as u8 (5-bit values do not fit u4), matching make_int8_weights. +void extract_q5_1_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & zp_arr, + bool use_bias) { + const uint64_t bytes_per_block = 24; // 2 scale + 2 min + 4 qh + 16 (32x0.5) weights + const int qk = 32; + + auto * data = static_cast(tensor->data); + auto * weights = static_cast(weights_arr.data()); // u8 weights, one byte per weight + auto * scales = scales_arr.data::value_type>(); + + // Read a 16-bit little-endian value without aliasing/const-qual violations. + auto read_u16 = [](const uint8_t * p) { + uint16_t v; + memcpy(&v, p, sizeof(v)); + return v; + }; + + auto unpack_block = [&](const uint8_t * block, uint8_t * dst) { + uint32_t qh; + memcpy(&qh, block + 4, sizeof(uint32_t)); + const uint8_t * qs = block + 8; + for (int j = 0; j < qk / 2; ++j) { + const uint8_t lo = qs[j] & 0x0F; + const uint8_t hi = qs[j] >> 4; + const uint8_t bit_lo = (qh >> j) & 1; + const uint8_t bit_hi = (qh >> (j + qk / 2)) & 1; + dst[j] = lo | (bit_lo << 4); // first 16 weights + dst[j + qk / 2] = hi | (bit_hi << 4); // last 16 weights + } + }; + + if (use_bias) { + // Store bias (min) directly as f16: dequant w*d + m + auto * bias = zp_arr.data::value_type>(); + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + const uint8_t * block = data + i * bytes_per_block; + float scale = static_cast(ov::float16::from_bits(read_u16(block))); + float min = static_cast(ov::float16::from_bits(read_u16(block + 2))); + scales[i] = ov::float16(scale); + bias[i] = ov::float16(min); + unpack_block(block, weights + i * qk); + }); + } else { + auto * zp = static_cast(zp_arr.data()); // u8 zero points + ov::parallel_for(scales_arr.get_size(), [&](size_t i) { + const uint8_t * block = data + i * bytes_per_block; + float scale = static_cast(ov::float16::from_bits(read_u16(block))); + float min = static_cast(ov::float16::from_bits(read_u16(block + 2))); + scales[i] = ov::float16(scale); + // zp = -min / scale (dequant: (w - zp) * s == w*s + min) + zp[i] = (scale != 0.0f) ? (uint8_t) std::lround(-min / scale) : 0; + unpack_block(block, weights + i * qk); + }); + } +} + // Extracts (weight, scales, zp) from Q8_0 tensors. // Data layout is: |16 bit scale|32 x 8bit weights|. // When zp_arr is empty (symmetric), weights are stored as signed i8 directly. @@ -577,6 +639,7 @@ std::shared_ptr extract_quantized_weights(const ggml_tensor * tensor, weights_per_block = 32; break; case GGML_TYPE_Q8_0: + case GGML_TYPE_Q5_1: case GGML_TYPE_Q5_K: is_u4 = false; weights_per_block = 32; @@ -601,6 +664,9 @@ std::shared_ptr extract_quantized_weights(const ggml_tensor * tensor, case GGML_TYPE_Q4_K: extract_q4_k_data(&temp_tensor, weights, scales, zp, use_bias); break; + case GGML_TYPE_Q5_1: + extract_q5_1_data(&temp_tensor, weights, scales, zp, use_bias); + break; case GGML_TYPE_Q8_0: extract_q8_0_data(&temp_tensor, weights, scales, zp); break; diff --git a/ggml/src/ggml-openvino/ggml-quants.h b/ggml/src/ggml-openvino/ggml-quants.h index e4a02297cae4..28b7c1213be2 100644 --- a/ggml/src/ggml-openvino/ggml-quants.h +++ b/ggml/src/ggml-openvino/ggml-quants.h @@ -6,7 +6,7 @@ #include #include -void unpack_32_4(const uint8_t* data, uint8_t* dst); +void unpack_32_4(const uint8_t * data, uint8_t * dst); void extract_q4_0_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, @@ -19,12 +19,18 @@ void extract_q4_1_data(const ggml_tensor * tensor, ov::Tensor & zp_arr, bool use_bias = false); +void extract_q5_1_data(const ggml_tensor * tensor, + ov::Tensor & weights_arr, + ov::Tensor & scales_arr, + ov::Tensor & zp_arr, + bool use_bias = false); + void extract_q8_0_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, ov::Tensor & scales_arr, ov::Tensor & zp_arr); -void unpack_256_4(const uint8_t* data, uint8_t* dst); +void unpack_256_4(const uint8_t * data, uint8_t * dst); void extract_q4_k_data(const ggml_tensor * tensor, ov::Tensor & weights_arr, @@ -145,8 +151,8 @@ namespace ov { namespace op { namespace util { // From /src/common/transformations/include/transformations/utils/utils.hpp -bool get_single_value(const std::shared_ptr& const_node, - float& value, +bool get_single_value(const std::shared_ptr & const_node, + float & value, bool check_value_range = true); } // namespace util } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/decoder.h b/ggml/src/ggml-openvino/openvino/decoder.h index 3b8da2be5d2b..9d64fe575c4c 100644 --- a/ggml/src/ggml-openvino/openvino/decoder.h +++ b/ggml/src/ggml-openvino/openvino/decoder.h @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include #include @@ -12,22 +14,50 @@ namespace ggml { class GgmlDecoder : public DecoderBase { public: - virtual ov::Any get_attribute(const std::string& name) const = 0; + virtual ov::Any get_attribute(const std::string & name) const = 0; - virtual PartialShape get_input_shape(int node_idx, const std::string& name) const = 0; + virtual PartialShape get_input_shape(int node_idx, const std::string & name) const = 0; - virtual std::vector get_input_stride(int node_idx, const std::string& name) const = 0; + virtual std::vector get_input_stride(int node_idx, const std::string & name) const = 0; - virtual element::Type get_input_type(int node_idx, const std::string& name) const = 0; + virtual size_t get_view_input_size(int node_idx, const std::string & name) const = 0; + + virtual size_t get_view_input_offset(int node_idx, const std::string & name, size_t view_index) const = 0; + + virtual size_t get_view_input_src_offset(int node_idx, const std::string & name, size_t view_index) const = 0; + + virtual std::vector get_view_input_stride(int node_idx, + const std::string & name, + size_t view_index) const = 0; + + virtual std::vector get_view_input_src_stride(int node_idx, + const std::string & name, + size_t view_index) const = 0; + + virtual Shape get_view_input_ggml_shape(int node_idx, const std::string & name, size_t view_index) const = 0; + + virtual Shape get_view_input_src_ggml_shape(int node_idx, const std::string & name, size_t view_index) const = 0; + + virtual PartialShape get_view_input_ov_shape(int node_idx, const std::string & name, size_t view_index) const = 0; + + virtual PartialShape get_view_input_src_ov_shape(int node_idx, + const std::string & name, + size_t view_index) const = 0; + + virtual std::string get_view_input_name(int node_idx, const std::string & name, size_t view_index) const = 0; + + virtual std::string get_view_input_src_name(int node_idx, const std::string & name, size_t view_index) const = 0; + + virtual element::Type get_input_type(int node_idx, const std::string & name) const = 0; virtual size_t get_input_size() const = 0; virtual size_t get_input_size(int node_idx) const = 0; virtual void get_input_node(size_t input_port_idx, - std::string& producer_name, - std::string& producer_output_port_name, - size_t& producer_output_port_index) const = 0; + std::string & producer_name, + std::string & producer_output_port_name, + size_t & producer_output_port_index) const = 0; virtual std::vector get_input_names(int node_idx) const = 0; @@ -35,30 +65,36 @@ class GgmlDecoder : public DecoderBase { virtual element::Type get_output_type(const int node_idx) const = 0; - virtual int32_t* get_input_op_params(int node_idx, const std::string& name) const = 0; + virtual std::vector get_output_stride(int node_idx) const = 0; + + virtual int32_t * get_input_op_params(int node_idx, const std::string & name) const = 0; virtual int32_t * get_output_op_params(int node_idx) const = 0; + virtual size_t get_output_op_offset(int node_idx) const = 0; + virtual std::vector get_output_names(int node_idx) const = 0; - virtual const std::string& get_op_type() const = 0; + virtual const std::string & get_op_type() const = 0; - virtual const std::string& get_op_type(int node_idx) const = 0; + virtual const std::string & get_op_type(int node_idx) const = 0; - virtual const std::string& get_op_name() const = 0; + virtual const std::string & get_op_name() const = 0; - virtual const std::string& get_op_name(int node_idx) const = 0; + virtual const std::string & get_op_name(int node_idx) const = 0; virtual void visit_subgraph(std::function, int node_idx)> node_visitor) const = 0; virtual int get_op_case(int node_idx) const = 0; - virtual const std::map>& get_model_inputs() const = 0; - virtual const std::map>& get_model_extra_inputs() const = 0; - virtual const std::map>& get_model_weights() const = 0; + virtual const std::map> & get_model_inputs() const = 0; + virtual const std::map> & get_model_extra_inputs() const = 0; + virtual const std::map> & get_model_weights() const = 0; virtual std::vector get_model_output_names() const = 0; - virtual int32_t* get_rope_params() const = 0; + virtual int32_t * get_rope_params() const = 0; + + virtual bool has_mixed_rope_params() const = 0; virtual std::map get_kv_param_res_names() const = 0; @@ -66,7 +102,11 @@ class GgmlDecoder : public DecoderBase { virtual bool is_stateful() const = 0; + virtual bool is_splited_model() const = 0; + virtual int is_swa_layer(int layer) const = 0; + + virtual int32_t get_op_dynamic_dim(int node_idx) const = 0; }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/frontend.h b/ggml/src/ggml-openvino/openvino/frontend.h index f1c6f0c3e3ce..72134a3e8cf2 100644 --- a/ggml/src/ggml-openvino/openvino/frontend.h +++ b/ggml/src/ggml-openvino/openvino/frontend.h @@ -15,7 +15,7 @@ class FrontEnd { using Ptr = std::shared_ptr; FrontEnd(); - static std::shared_ptr convert(const InputModel::Ptr& model, bool naive = false); + static std::shared_ptr convert(const InputModel::Ptr & model, bool naive = false); }; } // namespace ggml diff --git a/ggml/src/ggml-openvino/openvino/input_model.h b/ggml/src/ggml-openvino/openvino/input_model.h index ce8434426c90..6ddcea996f03 100644 --- a/ggml/src/ggml-openvino/openvino/input_model.h +++ b/ggml/src/ggml-openvino/openvino/input_model.h @@ -1,9 +1,9 @@ #pragma once -#include - #include "decoder.h" +#include + namespace ov { namespace frontend { namespace ggml { @@ -16,9 +16,9 @@ class InputModel : public ov::frontend::InputModel { friend class ::ov::frontend::ggml::FrontEnd; public: - explicit InputModel(const std::shared_ptr& gdecoder); + explicit InputModel(const std::shared_ptr & gdecoder); - const std::shared_ptr& get_model_decoder() const; + const std::shared_ptr & get_model_decoder() const; private: std::shared_ptr m_decoder; diff --git a/ggml/src/ggml-openvino/openvino/node_context.h b/ggml/src/ggml-openvino/openvino/node_context.h index aa484128a952..9769c30096e9 100644 --- a/ggml/src/ggml-openvino/openvino/node_context.h +++ b/ggml/src/ggml-openvino/openvino/node_context.h @@ -1,11 +1,11 @@ #pragma once +#include "decoder.h" + #include #include #include -#include "decoder.h" - namespace ov { namespace frontend { namespace ggml { @@ -16,28 +16,24 @@ typedef std::map> TensorMap; class NodeContext : public frontend::NodeContext { public: - NodeContext(const std::shared_ptr& decoder, - std::shared_ptr& tensor_map, + NodeContext(const std::shared_ptr & decoder, + std::shared_ptr & tensor_map, int node_idx, - TranslateSession* translate_session = nullptr) - : ov::frontend::NodeContext(decoder->get_op_type(node_idx)), - m_decoder(decoder), - m_tensor_map(tensor_map), - m_node_idx(node_idx), - m_translate_session(translate_session) { + TranslateSession * translate_session = nullptr) : + ov::frontend::NodeContext(decoder->get_op_type(node_idx)), + m_decoder(decoder), + m_tensor_map(tensor_map), + m_node_idx(node_idx), + m_translate_session(translate_session) { m_input_names = decoder->get_input_names(m_node_idx); m_output_names = decoder->get_output_names(m_node_idx); } - TranslateSession* get_translate_session() const { - return m_translate_session; - } + TranslateSession * get_translate_session() const { return m_translate_session; } - const std::vector& get_input_names() const { return m_input_names; } + const std::vector & get_input_names() const { return m_input_names; } - size_t get_input_size() const override { - return m_decoder->get_input_size(m_node_idx); - } + size_t get_input_size() const override { return m_decoder->get_input_size(m_node_idx); } ov::element::Type get_input_type(size_t index) const { return m_decoder->get_input_type(m_node_idx, m_input_names[index]); @@ -55,42 +51,103 @@ class NodeContext : public frontend::NodeContext { PartialShape get_output_shape() const { return m_decoder->get_output_shape(m_node_idx); } - int32_t* get_input_op_params(size_t index) const { + int32_t * get_input_op_params(size_t index) const { return m_decoder->get_input_op_params(m_node_idx, m_input_names[index]); } - int32_t * get_output_op_params() const { return m_decoder->get_output_op_params(m_node_idx); } + size_t get_view_input_size(size_t index) const { + return m_decoder->get_view_input_size(m_node_idx, m_input_names[index]); + } + + size_t get_view_input_offset(size_t index, size_t view_index) const { + return m_decoder->get_view_input_offset(m_node_idx, m_input_names[index], view_index); + } - ov::element::Type get_output_type() const { - return m_decoder->get_output_type(m_node_idx); + size_t get_view_input_src_offset(size_t index, size_t view_index) const { + return m_decoder->get_view_input_src_offset(m_node_idx, m_input_names[index], view_index); } + std::vector get_view_input_stride(size_t index, size_t view_index) const { + return m_decoder->get_view_input_stride(m_node_idx, m_input_names[index], view_index); + } + + std::vector get_view_input_src_stride(size_t index, size_t view_index) const { + return m_decoder->get_view_input_src_stride(m_node_idx, m_input_names[index], view_index); + } + + ov::Shape get_view_input_ggml_shape(size_t index, size_t view_index) const { + return m_decoder->get_view_input_ggml_shape(m_node_idx, m_input_names[index], view_index); + } + + ov::Shape get_view_input_src_ggml_shape(size_t index, size_t view_index) const { + return m_decoder->get_view_input_src_ggml_shape(m_node_idx, m_input_names[index], view_index); + } + + ov::PartialShape get_view_input_ov_shape(size_t index, size_t view_index) const { + return m_decoder->get_view_input_ov_shape(m_node_idx, m_input_names[index], view_index); + } + + ov::PartialShape get_view_input_src_ov_shape(size_t index, size_t view_index) const { + return m_decoder->get_view_input_src_ov_shape(m_node_idx, m_input_names[index], view_index); + } + + std::string get_view_input_name(size_t index, size_t view_index) const { + return m_decoder->get_view_input_name(m_node_idx, m_input_names[index], view_index); + } + + std::string get_view_input_src_name(size_t index, size_t view_index) const { + return m_decoder->get_view_input_src_name(m_node_idx, m_input_names[index], view_index); + } + + int32_t get_op_dynamic_dim() const { return m_decoder->get_op_dynamic_dim(m_node_idx); } + + int32_t * get_output_op_params() const { return m_decoder->get_output_op_params(m_node_idx); } + + size_t get_output_op_offset() const { return m_decoder->get_output_op_offset(m_node_idx); } + + ov::element::Type get_output_type() const { return m_decoder->get_output_type(m_node_idx); } + + std::vector get_output_stride() const { return m_decoder->get_output_stride(m_node_idx); } + Output get_input(int idx) const override { + // Check if this input is a VIEW + size_t view_input_size = m_decoder->get_view_input_size(m_node_idx, m_input_names[idx]); + if (view_input_size > 0) { + // This is a VIEW input, get the base tensor name (last element in the chain) + std::string base_name = + m_decoder->get_view_input_src_name(m_node_idx, m_input_names[idx], view_input_size - 1); + // Check if the VIEW has been resolved (translate_view produced a Slice) + auto view_it = m_tensor_map->find(m_input_names[idx]); + if (!base_name.empty() && view_it != m_tensor_map->end()) { + auto base_it = m_tensor_map->find(base_name); + if (base_it != m_tensor_map->end() && + view_it->second.get_node_shared_ptr() != base_it->second.get_node_shared_ptr()) { + return view_it->second; + } + return base_it->second; + } + if (!base_name.empty()) { + return m_tensor_map->at(base_name); + } + } + // Not a VIEW or failed to get base name, use the original logic return m_tensor_map->at(m_input_names[idx]); } - Output get_input(const std::string& name) const override { + Output get_input(const std::string & name) const override { if (m_tensor_map->find(name) == m_tensor_map->end()) { throw std::runtime_error("'" + name + "' not found in tensor map."); } return m_tensor_map->at(name); } - bool has_input(const std::string& name) const { - return m_tensor_map->find(name) != m_tensor_map->end(); - } + bool has_input(const std::string & name) const { return m_tensor_map->find(name) != m_tensor_map->end(); } - const std::string& get_name() const override { - return m_decoder->get_op_name(m_node_idx); - } + const std::string & get_name() const override { return m_decoder->get_op_name(m_node_idx); } - ov::Any get_attribute_as_any(const std::string& name) const override { - return m_decoder->get_attribute(name); - } + ov::Any get_attribute_as_any(const std::string & name) const override { return m_decoder->get_attribute(name); } - int get_op_case() const { - return m_decoder->get_op_case(m_node_idx); - } + int get_op_case() const { return m_decoder->get_op_case(m_node_idx); } bool is_static() const { return m_decoder->is_static(); } @@ -98,14 +155,14 @@ class NodeContext : public frontend::NodeContext { private: std::shared_ptr m_decoder; - std::shared_ptr& m_tensor_map; + std::shared_ptr & m_tensor_map; int m_node_idx; - TranslateSession* m_translate_session; + TranslateSession * m_translate_session; std::vector m_input_names; std::vector m_output_names; }; -using CreatorFunction = std::function; +using CreatorFunction = std::function; } // namespace ggml } // namespace frontend diff --git a/ggml/src/ggml-openvino/openvino/op/add_id.cpp b/ggml/src/ggml-openvino/openvino/op/add_id.cpp new file mode 100644 index 000000000000..e54d700d421a --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/add_id.cpp @@ -0,0 +1,76 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +static ov::Output reshape_add_id_input_to_2d(const ov::Output & input, + const ov::PartialShape & input_shape, + const std::vector & dims) { + const auto actual_shape = input.get_partial_shape(); + if (actual_shape.rank().is_static() && actual_shape.rank().get_length() == 2) { + return input; + } + + if (input_shape.rank().is_static() && input_shape.rank().get_length() == 2) { + return input; + } + + auto shape = std::make_shared(input, ov::element::i64); + return std::make_shared(input, get_dimensions(shape, dims), false); +} + +OutputVector translate_add_id(const NodeContext & context) { + num_inputs_check(context, 3, 3); + + auto input = process_view_input_new(context, 0); + auto bias = process_view_input_new(context, 1); + auto ids = process_view_input_new(context, 2); + + // OpenVINO uses reversed GGML dimensions: + // input: [1, n_token, n_used, n_embd] + // bias: [1, 1, n_expert, n_embd] + // ids: [1, 1, n_token, n_used] + // Model bias constants may already be stored as [n_expert, n_embd]. + bias = reshape_add_id_input_to_2d(bias, context.get_input_shape(1), {2, 3}); + ids = reshape_add_id_input_to_2d(ids, context.get_input_shape(2), {2, 3}); + + if (ids.get_element_type() != ov::element::i32 && ids.get_element_type() != ov::element::i64) { + ids = std::make_shared(ids, ov::element::i32); + } + + auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); + ov::Output selected_bias = std::make_shared(bias, ids, gather_axis); + selected_bias = std::make_shared( + selected_bias, std::make_shared(input, ov::element::i64), false); + + if (selected_bias.get_element_type() != input.get_element_type()) { + selected_bias = std::make_shared(selected_bias, input.get_element_type()); + } + + ov::Output res = std::make_shared(input, selected_bias); + const auto output_type = context.get_output_type(); + if (res.get_element_type() != output_type) { + res = std::make_shared(res, output_type); + } + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/argsort.cpp b/ggml/src/ggml-openvino/openvino/op/argsort.cpp new file mode 100644 index 000000000000..bb8344af8428 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/argsort.cpp @@ -0,0 +1,47 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" +#include "ggml.h" + +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_argsort(const NodeContext & context) { + num_inputs_check(context, 1, 1); + + auto input = process_view_input_new(context, 0); + + const int32_t order = context.get_output_op_params()[0]; + + ov::op::v11::TopK::Mode mode; + switch (order) { + case GGML_SORT_ORDER_ASC: + mode = ov::op::v11::TopK::Mode::MIN; + break; + case GGML_SORT_ORDER_DESC: + mode = ov::op::v11::TopK::Mode::MAX; + break; + default: + FRONT_END_OP_CONVERSION_CHECK(false, "Unsupported GGML_OP_ARGSORT order: ", order); + } + + auto k = std::make_shared(get_dimensions(input.get_node_shared_ptr(), {3}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + + auto topk = std::make_shared(input, k, 3, mode, ov::op::v11::TopK::SortType::SORT_VALUES, + context.get_output_type(), false); + + return rename_outputs_with_suffix({topk->output(1)}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/clamp.cpp b/ggml/src/ggml-openvino/openvino/op/clamp.cpp new file mode 100644 index 000000000000..070ad33b7794 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/clamp.cpp @@ -0,0 +1,33 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_clamp(const NodeContext & context) { + num_inputs_check(context, 1, 1); + + auto input = process_view_input_new(context, 0); + + const int32_t * op_params = context.get_output_op_params(); + FRONT_END_CHECK_IMPLEMENTED(op_params != nullptr, "CLAMP requires output op params"); + + float min; + float max; + std::memcpy(&min, reinterpret_cast(op_params) + 0, sizeof(float)); + std::memcpy(&max, reinterpret_cast(op_params) + 1, sizeof(float)); + + auto res = std::make_shared(input, min, max); + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/concat.cpp b/ggml/src/ggml-openvino/openvino/op/concat.cpp new file mode 100644 index 000000000000..4d36a666b5e5 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/concat.cpp @@ -0,0 +1,48 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_concat(const NodeContext & context) { + num_inputs_check(context, 2, 2); + + const int32_t * op_params = context.get_output_op_params(); + FRONT_END_CHECK_IMPLEMENTED(op_params != nullptr, "CONCAT requires output op params"); + + const auto output_shape = context.get_output_shape(); + FRONT_END_CHECK_IMPLEMENTED(output_shape.rank().is_static(), "CONCAT requires static output rank"); + + const auto rank = output_shape.rank().get_length(); + const int32_t ggml_dim = op_params[0]; + FRONT_END_CHECK_IMPLEMENTED(ggml_dim >= 0 && ggml_dim < rank, "CONCAT axis is out of range"); + + auto input_0 = process_view_input_new(context, 0); + auto input_1 = process_view_input_new(context, 1); + const auto output_type = context.get_output_type(); + + if (input_0.get_element_type() != output_type) { + input_0 = std::make_shared(input_0, output_type); + } + if (input_1.get_element_type() != output_type) { + input_1 = std::make_shared(input_1, output_type); + } + + const auto axis = static_cast(rank - 1 - ggml_dim); + auto res = std::make_shared(OutputVector{input_0, input_1}, axis); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/cont.cpp b/ggml/src/ggml-openvino/openvino/op/cont.cpp index 6160dd744446..1d6cc6721260 100644 --- a/ggml/src/ggml-openvino/openvino/op/cont.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cont.cpp @@ -18,27 +18,19 @@ namespace op { OutputVector translate_cont(const NodeContext & context) { num_inputs_check(context, 1, 1); - int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3, "Unsupported CONT case"); - auto src_shape = context.get_input_shape(0).to_shape(); auto dst_shape = context.get_output_shape().to_shape(); - ov::Output res; - if (op_case == 1) { - // The input comes from a PERMUTE - throw std::runtime_error("Code of this case might be outdated"); - dst_shape[1] = -1; - res = std::make_shared( - context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false); - } else if (op_case == 2) { - // The input comes from a TRANSPOSE - return {context.get_input(0)}; - } else { - // The input comes from a VIEW - res = process_view_input(context, 0); + if (context.get_op_dynamic_dim() != -1) { + dst_shape[3 - context.get_op_dynamic_dim()] = -1; } + auto input = process_view_input_new(context, 0); + + ov::Output res; + res = std::make_shared( + input, ov::op::v0::Constant::create(ov::element::i64, {dst_shape.size()}, dst_shape), false); + return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/cpy.cpp b/ggml/src/ggml-openvino/openvino/op/cpy.cpp index 831117208be4..3a4355021d98 100644 --- a/ggml/src/ggml-openvino/openvino/op/cpy.cpp +++ b/ggml/src/ggml-openvino/openvino/op/cpy.cpp @@ -3,7 +3,9 @@ #include "../utils.h" #include +#include #include +#include namespace ov { namespace frontend { @@ -11,7 +13,18 @@ namespace ggml { namespace op { OutputVector translate_cpy(const NodeContext & context) { - auto res = std::make_shared(context.get_input(0), context.get_output_type()); + auto input = process_view_input_new(context, 0); + auto input_shape = context.get_input_shape(0); + auto output_shape = context.get_output_shape(); + + // Non-cast CPY may need a reshape (e.g. [3,192,1,1] -> [576,1,1,1]) + if (input_shape != output_shape) { + auto new_shape = ov::op::v0::Constant::create( + ov::element::i64, {static_cast(output_shape.rank().get_length())}, output_shape.to_shape()); + input = std::make_shared(input, new_shape, false); + } + + auto res = std::make_shared(input, context.get_output_type()); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/div.cpp b/ggml/src/ggml-openvino/openvino/op/div.cpp new file mode 100644 index 000000000000..11dd9decec7a --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/div.cpp @@ -0,0 +1,146 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" +#include "ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +namespace { + +bool is_silu_div_pattern(const ov::Output & numerator, + const ov::Output & denominator, + const NodeContext & context) { + if (context.get_input_size() != 2) { + return false; + } + + const auto * unary_op = reinterpret_cast(context.get_input_op_params(0)); + if (unary_op == nullptr || *unary_op != GGML_UNARY_OP_SILU) { + return false; + } + + auto mul = std::dynamic_pointer_cast(numerator.get_node_shared_ptr()); + if (!mul) { + return false; + } + + const auto denom_node = denominator.get_node_shared_ptr(); + const auto mul_input_0 = mul->input_value(0).get_node_shared_ptr(); + const auto mul_input_1 = mul->input_value(1).get_node_shared_ptr(); + + auto sigmoid = std::dynamic_pointer_cast(mul_input_1); + if (mul_input_0 == denom_node && sigmoid && sigmoid->input_value(0).get_node_shared_ptr() == denom_node) { + return true; + } + + sigmoid = std::dynamic_pointer_cast(mul_input_0); + return mul_input_1 == denom_node && sigmoid && sigmoid->input_value(0).get_node_shared_ptr() == denom_node; +} + +ov::Output repeat_input_to_match(const NodeContext & context, + const ov::Output & input, + const ov::Output & target, + size_t input_index) { + const auto input_shape = context.get_input_shape(input_index); + const auto target_shape = context.get_input_shape(0); + + if (input_shape == target_shape) { + return input; + } + + if (input_shape.rank().is_static() && target_shape.rank().is_static()) { + const auto rank = static_cast(input_shape.rank().get_length()); + std::vector repeats(rank, 1); + bool needs_repeat = false; + + for (size_t axis = 0; axis < rank; ++axis) { + FRONT_END_OP_CONVERSION_CHECK(input_shape[axis].is_static() && target_shape[axis].is_static(), + "DIV repeat requires static dimensions on both inputs"); + + const int64_t input_dim = input_shape[axis].get_length(); + const int64_t target_dim = target_shape[axis].get_length(); + + FRONT_END_OP_CONVERSION_CHECK(input_dim > 0 && target_dim > 0 && target_dim % input_dim == 0, + "DIV input shape ", input_shape, " cannot repeat to match ", target_shape); + + repeats[axis] = target_dim / input_dim; + needs_repeat = needs_repeat || repeats[axis] != 1; + } + + if (!needs_repeat) { + return input; + } + + auto repeats_node = ov::op::v0::Constant::create(ov::element::i64, {repeats.size()}, repeats); + return std::make_shared(input, repeats_node); + } + + auto input_shape_node = std::make_shared(input, ov::element::i64); + auto target_shape_node = std::make_shared(target, ov::element::i64); + auto repeats_node = std::make_shared(target_shape_node, input_shape_node); + return std::make_shared(input, repeats_node); +} + +} // namespace + +OutputVector translate_div(const NodeContext & context) { + num_inputs_check(context, 2, 2); + + auto input_0 = process_view_input_new(context, 0); + auto input_1 = process_view_input_new(context, 1); + + if (is_silu_div_pattern(input_0, input_1, context)) { + ov::Output res = std::make_shared(input_1); + if (res.get_element_type() != context.get_output_type()) { + res = std::make_shared(res, context.get_output_type()); + } + return rename_outputs_with_suffix({res}, context.get_name()); + } + + input_1 = repeat_input_to_match(context, input_1, input_0, 1); + + const auto output_type = context.get_output_type(); + const bool use_f32_compute = input_0.get_element_type() != ov::element::f32 || + input_1.get_element_type() != ov::element::f32 || output_type != ov::element::f32; + + if (use_f32_compute) { + input_0 = std::make_shared(input_0, ov::element::f32); + input_1 = std::make_shared(input_1, ov::element::f32); + } + + ov::Output res = std::make_shared(input_0, input_1); + if (use_f32_compute) { + // Keep the reciprocal/divide path in FP32. Without this hint, the GPU + // plugin can still compress the subgraph back to FP16 and overflow on + // small shexp gate values (e.g. silu(x) / x in qwen2moe). + ov::mark_as_precision_sensitive(res.get_node_shared_ptr()->input(0)); + ov::mark_as_precision_sensitive(res.get_node_shared_ptr()->input(1)); + } + if (res.get_element_type() != output_type) { + auto output_convert = std::make_shared(res, output_type); + if (use_f32_compute) { + ov::mark_as_precision_sensitive(output_convert->input(0)); + } + res = output_convert; + } + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp index 42602a730a4f..582df0130b59 100644 --- a/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp +++ b/ggml/src/ggml-openvino/openvino/op/flash_attn_ext.cpp @@ -1,15 +1,21 @@ #include "../node_context.h" #include "../op_table.h" #include "../utils.h" +#include "ggml-openvino/ggml-openvino-extra.h" #include +#include #include +#include #include #include #include #include +#include +#include #include #include +#include #include #include #include @@ -34,36 +40,115 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) { auto q = std::make_shared(q_f32, ov::element::f16); auto scale_node = std::make_shared(ov::element::f16, ov::Shape{}, std::vector{scale}); - ov::Output mask_sliced, res; + ov::Output res; + + // For stateful std::string mask_name = "KQ_mask_sliced"; if (context.get_input_names()[3].find("swa") != std::string::npos) { mask_name = "KQ_mask_swa_sliced"; } if (context.has_input(mask_name)) { - mask_sliced = context.get_input(mask_name); - } else { - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto token_len = get_dimensions(q, {2}); - mask_sliced = std::make_shared(mask, zero, token_len, one, two); + mask = context.get_input(mask_name); } - if (mask_sliced.get_element_type() != ov::element::f16) { - mask_sliced = std::make_shared(mask_sliced, ov::element::f16); + if (mask.get_element_type() != ov::element::f16) { + mask = std::make_shared(mask, ov::element::f16); } - auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output kv) { - int64_t factor = num_heads / num_heads_kv; - if (factor > 1 && num_heads_kv > 1) { + //auto tile_kv = [&](int64_t num_heads, int64_t num_heads_kv, int64_t head_size, ov::Output kv) { + // int64_t factor = num_heads / num_heads_kv; + // if (factor > 1 && num_heads_kv > 1) { + auto q_shape = context.get_input_shape(0).to_shape(); + auto k_shape = context.get_input_shape(1).to_shape(); + const int64_t num_heads = q_shape[1]; + const int64_t num_heads_kv = k_shape[1]; + const int64_t head_size = q_shape[3]; + const int64_t factor = num_heads / num_heads_kv; + + // Manual GQA attention: enabled by default on GPU in stateless mode. + // Set GGML_OPENVINO_MANUAL_GQA_ATTN to a positive value (e.g. 1) to force-enable, + // or to 0 to force-disable. Unset falls back to the device-based default. + static const bool manual_gqa_enabled = []() { + const char * env = ggml_openvino_getenv_str("GGML_OPENVINO_MANUAL_GQA_ATTN"); + if (env != nullptr) { + return ggml_openvino_getenv_int("GGML_OPENVINO_MANUAL_GQA_ATTN") > 0; + } + const char * dev = ggml_openvino_getenv_str("GGML_OPENVINO_DEVICE"); + return dev != nullptr && std::string(dev) == "GPU"; + }(); + const bool use_manual_gqa_attention = + manual_gqa_enabled && factor > 1 && num_heads_kv > 1 && !context.is_stateful(); + + if (use_manual_gqa_attention) { + // Q, K, V arrive as [B, n_heads(_kv), S, head_size], where B is the active + // batch (n_seq_active) and may be > 1 (llama-perplexity, llama-server -np > 1) + // or dynamic. Reshape to + // K_r: [B, num_heads_kv, 1, S, head_size] + // Q_r: [B, num_heads_kv, factor, S_q, head_size] + // and let MatMul broadcast across the factor dim without materialising + // an expanded K/V. The leading 0 + special_zero=true copies B at runtime, + // so this is correct for B == 1, B > 1, and dynamic B alike. Only the head + // dims and head_size are baked in as literals; the sequence dim stays -1. + auto k_5d_shape = ov::op::v0::Constant::create(ov::element::i64, {5}, + std::vector{0, num_heads_kv, 1, -1, head_size}); + auto v_5d_shape = ov::op::v0::Constant::create(ov::element::i64, {5}, + std::vector{0, num_heads_kv, 1, -1, head_size}); + auto q_5d_shape = ov::op::v0::Constant::create(ov::element::i64, {5}, + std::vector{0, num_heads_kv, factor, -1, head_size}); + + auto k_r = std::make_shared(k, k_5d_shape, true); + auto v_r = std::make_shared(v, v_5d_shape, true); + auto q_r = std::make_shared(q, q_5d_shape, true); + + // QK^T → [B, num_heads_kv, factor, S_q, S_k] + auto qk = std::make_shared(q_r, k_r, /*tA=*/false, /*tB=*/true); + auto qk_scaled = std::make_shared(qk, scale_node); + + // Mask arrives as [B, 1, S_q, S_k]. Unsqueeze a factor axis at position 2 to + // get [B, 1, 1, S_q, S_k], which NUMPY-broadcasts cleanly against the + // [B, num_heads_kv, factor, S_q, S_k] scores: B==B, then 1→num_heads_kv and + // 1→factor on the head dims. + auto mask_unsq1 = + std::make_shared(mask, ov::op::v0::Constant::create(ov::element::i64, {1}, {2})); + // mask_unsq1: [B, 1, 1, S_q, S_k] (rank 5) + ov::Output qk_masked = std::make_shared(qk_scaled, mask_unsq1); + + auto softmax = std::make_shared(qk_masked, /*axis=*/-1); + + // softmax @ V → [B, num_heads_kv, factor, S_q, head_size] + auto attn = std::make_shared(softmax, v_r); + + // Reshape back to [B, num_heads, S_q, head_size] (combine num_heads_kv * factor). + // Leading 0 + special_zero=true copies B at runtime. + auto out_4d_shape = + ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, num_heads, -1, head_size}); + auto out_4d = std::make_shared(attn, out_4d_shape, true); + + // The standard SDPA path's downstream is Transpose(0,2,1,3) → Convert(f32). + // Replicate it here so callers see the same output layout/dtype. + res = std::make_shared( + out_4d, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); + res = std::make_shared(res, ov::element::f32); + return rename_outputs_with_suffix({res}, context.get_name()); + } + + // Default path: explicit Broadcast → SDPA. Kept as the fallback because + // (a) it goes through the GPU plugin's micro-SDPA fast path (FlashAttention + // tiles via DPAS), and (b) the manual path above is still being validated. + auto tile_kv = [&](int64_t n_heads, int64_t n_heads_kv, int64_t hs, ov::Output kv) { + int64_t f = n_heads / n_heads_kv; + if (f > 1 && n_heads_kv > 1) { ov::Output kv_broadcast_shape, kv_unsqueezed, new_kv_shape; auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, Shape{}, {2}); kv_unsqueezed = std::make_shared(kv, unsqueeze_axes); - kv_broadcast_shape = ov::op::v0::Constant::create( - ov::element::i64, {5}, {(int64_t) 1, (int64_t) 1, factor, (int64_t) 1, (int64_t) 1}); + kv_broadcast_shape = ov::op::v0::Constant::create(ov::element::i64, {5}, + {(int64_t) 1, (int64_t) 1, f, (int64_t) 1, (int64_t) 1}); new_kv_shape = - ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 0, num_heads, (int64_t) -1, head_size}); + ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 0, n_heads, (int64_t) -1, hs}); + // ov::element::i64, {5}, {(int64_t) 1, (int64_t) 1, factor, (int64_t) 1, (int64_t) 1}); + //new_kv_shape = + // ov::op::v0::Constant::create(ov::element::i64, {4}, {(int64_t) 0, num_heads, (int64_t) -1, head_size}); kv = std::make_shared(kv_unsqueezed, kv_broadcast_shape, ov::op::BroadcastType::BIDIRECTIONAL); @@ -72,12 +157,14 @@ OutputVector translate_flash_attn_ext(const NodeContext & context) { return kv; }; - auto q_shape = context.get_input_shape(0).to_shape(); - auto k_shape = context.get_input_shape(1).to_shape(); - k = tile_kv(q_shape[1], k_shape[1], q_shape[3], k); - v = tile_kv(q_shape[1], k_shape[1], q_shape[3], v); + //auto q_shape = context.get_input_shape(0).to_shape(); + //auto k_shape = context.get_input_shape(1).to_shape(); + //k = tile_kv(q_shape[1], k_shape[1], q_shape[3], k); + //v = tile_kv(q_shape[1], k_shape[1], q_shape[3], v); + k = tile_kv(num_heads, num_heads_kv, head_size, k); + v = tile_kv(num_heads, num_heads_kv, head_size, v); - auto sdpa = std::make_shared(q, k, v, mask_sliced, scale_node, false); + auto sdpa = std::make_shared(q, k, v, mask, scale_node, false); res = std::make_shared(sdpa, ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3})); res = std::make_shared(res, ov::element::f32); diff --git a/ggml/src/ggml-openvino/openvino/op/gated_delta_net.cpp b/ggml/src/ggml-openvino/openvino/op/gated_delta_net.cpp new file mode 100644 index 000000000000..26c4bbfa9850 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/gated_delta_net.cpp @@ -0,0 +1,282 @@ +#include "gated_delta_net.hpp" + +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +static OutputVector translate_gated_delta_net_ref(const NodeContext & context); + +OutputVector translate_gated_delta_net(const NodeContext & context) { + // auto v_shape = context.get_input_shape(2).to_shape(); // [B, T, H_v, S_v] + // auto q_shape = context.get_input_shape(0).to_shape(); // [B, T, H_k, S_k] + + // // Fused GatedDeltaNet op only supports scalar gate (kda=0). + // // Fall back to reference implementation for per-key-dimension gating. + // // if (kda) { + // // return translate_gated_delta_net_ref(context); + // // } + + // auto q = context.get_input(0); + // auto k = context.get_input(1); + // auto v = context.get_input(2); + // auto g = context.get_input(3); + // auto beta = context.get_input(4); + // auto state = context.get_input(5); + + // const int64_t B = v_shape[0]; + // const int64_t T = v_shape[1]; + // const int64_t H_v = v_shape[2]; + // const int64_t S_v = v_shape[3]; + // const int64_t S_k = q_shape[3]; + + // // ggml state layout (OV notation): [B, H_v, value_dim, key_dim] + // // GatedDeltaNet op expects: [B, H_v, key_dim, value_dim] + // auto state_reshape_shape = + // ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{B, H_v, S_v, S_k}); + // state = std::make_shared(state, state_reshape_shape, false); + // auto state_perm = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 1, 3, 2}); + // state = std::make_shared(state, state_perm); + + // g = std::make_shared(g, ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); + // beta = std::make_shared(beta, ov::op::v0::Constant::create(ov::element::i64, {1}, {3})); + + // auto gdn = std::make_shared(q, k, v, state, g, beta); + + // auto attn_4d = gdn->output(0); + // auto state_4d = gdn->output(1); // [B, H_v, key_dim, value_dim] + // // Transpose output state back to ggml layout [B, H_v, value_dim, key_dim] + // auto state_transposed = std::make_shared(state_4d, state_perm); + // auto flat_shape_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + // auto attn = std::make_shared(attn_4d, flat_shape_1d, false); + // auto new_state = std::make_shared(state_transposed, flat_shape_1d, false); + // auto packed = std::make_shared(ov::OutputVector{attn, new_state}, 0); + // auto out_shape = + // ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{1, 1, T * B + S_v * B, S_v * H_v}); + // auto res = std::make_shared(packed, out_shape, false); + + // return rename_outputs_with_suffix({res}, context.get_name()); + + // The OV version in CI does not have the GatedDeltaNet op, so use reference implementation for now. + return translate_gated_delta_net_ref(context); +} + +static OutputVector translate_gated_delta_net_ref(const NodeContext & context) { + num_inputs_check(context, 6, 6); + + // Inputs (OV shapes are reversed from ggml): + // ggml: q[S_k, H_k, T, B], k[S_k, H_k, T, B], v[S_v, H_v, T, B] + // OV: q[B, T, H_k, S_k], k[B, T, H_k, S_k], v[B, T, H_v, S_v] + // ggml: g[1 or S_v, H_v, T, B], beta[1, H_v, T, B] + // OV: g[B, T, H_v, 1 or S_v], beta[B, T, H_v, 1] + // ggml: state[S_v, S_v, H_v, B] + // OV: state[B, H_v, S_v, S_v] + auto q = process_view_input_new(context, 0); + auto k = process_view_input_new(context, 1); + auto v = process_view_input_new(context, 2); + auto g = process_view_input_new(context, 3); + auto beta = process_view_input_new(context, 4); + auto state = process_view_input_new(context, 5); + + auto v_shape = context.get_input_shape(2).to_shape(); // [B, T, H_v, S_v] + auto q_shape = context.get_input_shape(0).to_shape(); // [B, T, H_k, S_k] + auto g_shape = context.get_input_shape(3).to_shape(); // [B, T, H_v, 1 or S_v] + + const int64_t B = v_shape[0]; + const int64_t T = v_shape[1]; + const int64_t H_v = v_shape[2]; + const int64_t S_v = v_shape[3]; + const int64_t H_k = q_shape[2]; + const bool kda = (g_shape[3] == (size_t) S_v); + + const int64_t rq1 = H_v / H_k; // head repeat factor + const float scale = 1.0f / std::sqrt((float) S_v); + + auto axis_1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto axis_2 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + + // Transpose inputs from [B, T, H, S] to [B, H, T, S] for easier per-head processing + auto perm_0213 = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 2, 1, 3}); + auto q_t = std::make_shared(q, perm_0213); // [B, H_k, T, S_k] + auto k_t = std::make_shared(k, perm_0213); // [B, H_k, T, S_k] + auto v_t = std::make_shared(v, perm_0213); // [B, H_v, T, S_v] + auto g_t = std::make_shared(g, perm_0213); // [B, H_v, T, 1 or S_v] + auto beta_t = std::make_shared(beta, perm_0213); // [B, H_v, T, 1] + + // Broadcast Q, K heads to match V heads if GQA is used (H_v > H_k) + ov::Output q_bh = q_t; + ov::Output k_bh = k_t; + if (rq1 > 1) { + auto q_unsq = std::make_shared(q_t, axis_2); // [B, H_k, 1, T, S] + auto k_unsq = std::make_shared(k_t, axis_2); // [B, H_k, 1, T, S] + + auto bcast_shape = ov::op::v0::Constant::create(ov::element::i64, {5}, std::vector{1, 1, rq1, 1, 1}); + auto q_bcast = + std::make_shared(q_unsq, bcast_shape, ov::op::BroadcastType::BIDIRECTIONAL); + auto k_bcast = + std::make_shared(k_unsq, bcast_shape, ov::op::BroadcastType::BIDIRECTIONAL); + + // Transpose [B, H_k, rq1, T, S] -> [B, rq1, H_k, T, S] so that reshape merges + // as [rq1, H_k] giving repeat-blocks pattern matching CPU: iq1 = iv1 % H_k + auto perm_5d = ov::op::v0::Constant::create(ov::element::i64, {5}, std::vector{0, 2, 1, 3, 4}); + auto q_transposed = std::make_shared(q_bcast, perm_5d); + auto k_transposed = std::make_shared(k_bcast, perm_5d); + + auto new_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{B, H_v, T, S_v}); + q_bh = std::make_shared(q_transposed, new_shape, false); + k_bh = std::make_shared(k_transposed, new_shape, false); + } + + // Merge batch and head dims: [B*H_v, T, S_v] + auto merge_bh = [&](ov::Output x, int64_t last_dim) { + auto shape = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{B * H_v, T, last_dim}); + return std::make_shared(x, shape, false); + }; + + auto q_m = merge_bh(q_bh, S_v); // [B*H_v, T, S_v] + auto k_m = merge_bh(k_bh, S_v); // [B*H_v, T, S_v] + auto v_m = merge_bh(v_t, S_v); // [B*H_v, T, S_v] + auto g_m = merge_bh(g_t, kda ? S_v : 1); // [B*H_v, T, 1 or S_v] + auto beta_m = merge_bh(beta_t, 1); // [B*H_v, T, 1] + + // State: [B, H_v, S_v, S_v] -> [B*H_v, S_v, S_v] + auto state_shape = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{B * H_v, S_v, S_v}); + auto state_m = std::make_shared(state, state_shape, false); + + auto scale_const = ov::op::v0::Constant::create(ov::element::f32, {}, std::vector{scale}); + + // --- Build Loop body --- + // Body parameters (no iteration counter needed, use -1 in special ports) + auto body_state = std::make_shared(ov::element::f32, ov::PartialShape::dynamic()); + auto body_q = std::make_shared(ov::element::f32, ov::PartialShape::dynamic()); + auto body_k = std::make_shared(ov::element::f32, ov::PartialShape::dynamic()); + auto body_v = std::make_shared(ov::element::f32, ov::PartialShape::dynamic()); + auto body_g = std::make_shared(ov::element::f32, ov::PartialShape::dynamic()); + auto body_beta = std::make_shared(ov::element::f32, ov::PartialShape::dynamic()); + auto body_iter = std::make_shared(ov::element::i64, ov::Shape{1}); + + // Condition output (always true - we rely on trip_count for termination) + auto body_cond_out = ov::op::v0::Constant::create(ov::element::boolean, ov::Shape{1}, std::vector{true}); + + // Gather current token from invariant inputs using iteration counter + auto q_t_cur = std::make_shared(body_q, body_iter, axis_1); // [B*H_v, 1, S_v] + auto k_t_cur = std::make_shared(body_k, body_iter, axis_1); // [B*H_v, 1, S_v] + auto v_t_cur = std::make_shared(body_v, body_iter, axis_1); // [B*H_v, 1, S_v] + auto g_t_cur = std::make_shared(body_g, body_iter, axis_1); // [B*H_v, 1, 1 or S_v] + auto b_t_cur = std::make_shared(body_beta, body_iter, axis_1); // [B*H_v, 1, 1] + + // Squeeze token dim + auto q_cur = std::make_shared(q_t_cur, axis_1); // [B*H_v, S_v] + auto k_cur = std::make_shared(k_t_cur, axis_1); // [B*H_v, S_v] + auto v_cur = std::make_shared(v_t_cur, axis_1); // [B*H_v, S_v] + auto g_cur = std::make_shared(g_t_cur, axis_1); // [B*H_v, 1 or S_v] + auto b_cur = std::make_shared(b_t_cur, axis_1); // [B*H_v, 1] + + // Step 1: Apply decay gate to state + auto exp_g = std::make_shared(g_cur); // [B*H_v, 1 or S_v] + auto exp_g_unsq = std::make_shared(exp_g, axis_1); // [B*H_v, 1, 1 or S_v] + auto state_decayed = std::make_shared(body_state, exp_g_unsq); // [B*H_v, S_v, S_v] + + // Step 2: delta = (v - S @ k) * beta + auto k_col = std::make_shared(k_cur, axis_2); // [B*H_v, S_v, 1] + auto sk = std::make_shared(state_decayed, k_col, false, false); // [B*H_v, S_v, 1] + auto sk_sq = std::make_shared(sk, axis_2); // [B*H_v, S_v] + auto v_minus_sk = std::make_shared(v_cur, sk_sq); // [B*H_v, S_v] + auto delta = std::make_shared(v_minus_sk, b_cur); // [B*H_v, S_v] + + // Step 3: state += outer(delta, k) + auto delta_col = std::make_shared(delta, axis_2); // [B*H_v, S_v, 1] + auto k_row = std::make_shared(k_cur, axis_1); // [B*H_v, 1, S_v] + auto outer_prod = std::make_shared(delta_col, k_row, false, false); // [B*H_v, S_v, S_v] + auto state_updated = std::make_shared(state_decayed, outer_prod); // [B*H_v, S_v, S_v] + + // Step 4: attn_out = S @ q * scale + auto q_col = std::make_shared(q_cur, axis_2); // [B*H_v, S_v, 1] + auto sq = std::make_shared(state_updated, q_col, false, false); // [B*H_v, S_v, 1] + auto sq_squeezed = std::make_shared(sq, axis_2); // [B*H_v, S_v] + auto attn_out = std::make_shared(sq_squeezed, scale_const); // [B*H_v, S_v] + + // Unsqueeze attn_out to [B*H_v, 1, S_v] for scan output concatenation + auto attn_out_unsq = std::make_shared(attn_out, axis_1); // [B*H_v, 1, S_v] + + // --- Assemble Loop --- + // Body: results = [condition, state_updated, attn_out_unsq] + auto body = std::make_shared( + ov::OutputVector{body_cond_out, state_updated, attn_out_unsq}, + ov::ParameterVector{body_iter, body_state, body_q, body_k, body_v, body_g, body_beta}); + + auto trip_count = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, std::vector{T}); + auto exec_cond = ov::op::v0::Constant::create(ov::element::boolean, ov::Shape{1}, std::vector{true}); + + auto loop = std::make_shared(trip_count, exec_cond); + loop->set_function(body); + loop->set_special_body_ports(ov::op::v5::Loop::SpecialBodyPorts{0, 0}); + + // Carried state: feeds back from body output 1 to body_state param + loop->set_merged_input(body_state, state_m, state_updated); + // Invariant inputs: passed through unchanged each iteration + loop->set_invariant_input(body_q, q_m); + loop->set_invariant_input(body_k, k_m); + loop->set_invariant_input(body_v, v_m); + loop->set_invariant_input(body_g, g_m); + loop->set_invariant_input(body_beta, beta_m); + + // Loop outputs: + // 1) Final state (last iteration value of state_updated) + auto final_state_out = loop->get_iter_value(state_updated, -1); // [B*H_v, S_v, S_v] + // 2) Concatenated attention outputs across all iterations along axis 1 + auto attn_concat_out = loop->get_concatenated_slices(attn_out_unsq, 0, 1, 1, -1, 1); // [B*H_v, T, S_v] + + // --- Pack outputs to match ggml layout --- + // ggml output ne = {S_v*H, T*B + S_v*B, 1, 1} -> OV [1, 1, T*B+S_v*B, S_v*H_v] + // attn: [B, T, H_v, S_v] row-major, state: [B, H_v, S_v, S_v] row-major + + // attn: [B*H_v, T, S_v] -> [B, H_v, T, S_v] -> transpose to [B, T, H_v, S_v] -> flatten + auto attn_4d_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{B, H_v, T, S_v}); + auto attn_4d = std::make_shared(attn_concat_out, attn_4d_shape, false); + auto attn_perm = std::make_shared(attn_4d, perm_0213); // [B, T, H_v, S_v] + + auto flat_shape_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{-1}); + auto attn_1d = std::make_shared(attn_perm, flat_shape_1d, false); + + // state: [B*H_v, S_v, S_v] -> [B, H_v, S_v, S_v] -> flatten + auto state_4d_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{B, H_v, S_v, S_v}); + auto state_4d = std::make_shared(final_state_out, state_4d_shape, false); + auto state_1d = std::make_shared(state_4d, flat_shape_1d, false); + + // Concat [attn | state] and reshape to final output + auto packed = std::make_shared(ov::OutputVector{attn_1d, state_1d}, 0); + auto out_shape = + ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{1, 1, T * B + S_v * B, S_v * H_v}); + auto res = std::make_shared(packed, out_shape, false); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/gated_delta_net.hpp b/ggml/src/ggml-openvino/openvino/op/gated_delta_net.hpp new file mode 100644 index 000000000000..20a4cfdfe743 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/gated_delta_net.hpp @@ -0,0 +1,65 @@ +#pragma once + +#include "openvino/op/op.hpp" + +namespace ov::op::internal { +/// \note GatedDeltaNet op class is under development and subject to change +/// +/// \brief Operator performing Gated Delta Net computation +/// \ingroup ov_ops_cpp_api +class OPENVINO_API GatedDeltaNet : public ov::op::Op { +public: + OPENVINO_OP("GatedDeltaNet") + + GatedDeltaNet() = default; + /// \brief Constructs a GatedDeltaNet operation. + /// + /// \param query Query tensor input. + /// \param key Key tensor input. + /// \param value Value tensor input. + /// \param recurrent_state Initial recurrent state tensor. + /// \param gate Gate tensor controlling state decay/update. + /// \param beta Beta tensor scaling the delta update. + /// \param fuse_qk_l2norm Enables fusing q/k L2-normalization into this op. + /// \param q_l2_norm_eps Epsilon used for query L2-normalization when fusion is enabled. + /// \param k_l2_norm_eps Epsilon used for key L2-normalization when fusion is enabled. + GatedDeltaNet(const Output& query, + const Output& key, + const Output& value, + const Output& recurrent_state, + const Output& gate, + const Output& beta, + const bool fuse_qk_l2norm = false, + const float q_l2_norm_eps = 1e-6F, + const float k_l2_norm_eps = 1e-6F); + + /// \brief Constructs a GatedDeltaNet operation from input vector. + /// + /// \param args Input tensor vector in order: query, key, value, recurrent_state, gate, beta. + /// \param fuse_qk_l2norm Enables fusing q/k L2-normalization into this op. + /// \param q_l2_norm_eps Epsilon used for query L2-normalization when fusion is enabled. + /// \param k_l2_norm_eps Epsilon used for key L2-normalization when fusion is enabled. + GatedDeltaNet(const ov::OutputVector& args, + const bool fuse_qk_l2norm = false, + const float q_l2_norm_eps = 1e-6F, + const float k_l2_norm_eps = 1e-6F); + void validate_and_infer_types() override; + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const ov::OutputVector& new_args) const override; + bool get_fuse_qk_l2norm() const { + return m_fuse_qk_l2norm; + } + float get_q_l2_norm_eps() const { + return m_q_l2_norm_eps; + } + float get_k_l2_norm_eps() const { + return m_k_l2_norm_eps; + } + +private: + bool m_fuse_qk_l2norm = false; + float m_q_l2_norm_eps = 1e-6F; + float m_k_l2_norm_eps = 1e-6F; +}; + +} // namespace ov::op::internal diff --git a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp index 49f51b7ca3fc..380e70a72e07 100644 --- a/ggml/src/ggml-openvino/openvino/op/get_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/get_rows.cpp @@ -18,16 +18,9 @@ namespace op { OutputVector translate_get_rows(const NodeContext & context) { num_inputs_check(context, 2, 2); - int op_case = context.get_op_case(); - Output res; - auto data = context.get_input(0); - auto indices = context.get_input(1); - - if (op_case == 2) { - // The input comes from a VIEW - indices = process_view_input(context, 1); - } + auto data = process_view_input_new(context, 0); + auto indices = process_view_input_new(context, 1); // data[1,b,x,y] ind[1,1,b,x'] test-backend-ops case // data[x,y] ind[1,1,1,x'] normal case diff --git a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp index d9fa4c24367c..a54870d9d74f 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_geglu.cpp @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -21,23 +22,26 @@ OutputVector translate_glu_geglu(const NodeContext & context) { ov::Output src0; ov::Output src1; if (context.get_input_size() == 2) { - src0 = context.get_input(0); - src1 = context.get_input(1); + // Inputs may be VIEW slices of a combined gate_up tensor (MoE experts): + // resolve them so each half has its real sliced shape, not the base tensor. + src0 = process_view_input_new(context, 0); + src1 = process_view_input_new(context, 1); } else { // GGML splits along ne[0] (OV last axis) using floor division: nc = ne[0] / 2. // Both halves are nc elements; if the dimension is odd, the last element is dropped. // Use Slice instead of Split to handle odd dimensions correctly. - auto combined = context.get_input(0); + // Resolve a VIEW input (e.g. non-contiguous slice) to its real shape first. + auto combined = process_view_input_new(context, 0); auto combined_shape = combined.get_partial_shape(); int64_t last_dim_val = combined_shape[combined_shape.rank().get_length() - 1].get_length(); int64_t nc = last_dim_val / 2; - auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); - auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); auto start0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto stop0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc}); + auto stop0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc}); auto start1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc}); - auto stop1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2 * nc}); + auto stop1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2 * nc}); src0 = std::make_shared(combined, start0, stop0, step, axis); src1 = std::make_shared(combined, start1, stop1, step, axis); @@ -49,6 +53,16 @@ OutputVector translate_glu_geglu(const NodeContext & context) { std::swap(src0, src1); } + if (context.is_static()) { + // TODO: Temporary solution for NPU accuracy issue due to fp16 overflow + // To be removed once permanent solution is implemented + // Justification: + // For |x| > 5, GELU(x) ≈ max(x, 0) (behaves like ReLU) + // So Clamp(-10, 10) only affects values where GELU would return ≈ x anyway. + // The only loss: values > 10 get mapped to 10 instead of x. + // In practice, FFN intermediates rarely exceed 10 after GEGLU gating. + src0 = std::make_shared(src0, -10.0, 10.0); + } auto gelu = std::make_shared(src0); auto res = std::make_shared(gelu, src1); diff --git a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp index 00ed7951a03d..d220f2f584a5 100644 --- a/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/glu_swiglu.cpp @@ -3,8 +3,11 @@ #include "../utils.h" #include +#include #include #include +#include +#include #include #include #include @@ -15,29 +18,32 @@ namespace frontend { namespace ggml { namespace op { -OutputVector translate_glu_swiglu(const NodeContext & context) { +static std::pair, ov::Output> get_glu_inputs(const NodeContext & context) { num_inputs_check(context, 1, 2); ov::Output src0; ov::Output src1; if (context.get_input_size() == 2) { - src0 = context.get_input(0); - src1 = context.get_input(1); + // Inputs may be VIEW slices of a combined gate_up tensor (MoE experts): + // resolve them so each half has its real sliced shape, not the base tensor. + src0 = process_view_input_new(context, 0); + src1 = process_view_input_new(context, 1); } else { // GGML splits along ne[0] (OV last axis) using floor division: nc = ne[0] / 2. // Both halves are nc elements; if the dimension is odd, the last element is dropped. // Use Slice instead of Split to handle odd dimensions correctly. - auto combined = context.get_input(0); + // Resolve a VIEW input (e.g. non-contiguous slice) to its real shape first. + auto combined = process_view_input_new(context, 0); auto combined_shape = combined.get_partial_shape(); int64_t last_dim_val = combined_shape[combined_shape.rank().get_length() - 1].get_length(); int64_t nc = last_dim_val / 2; - auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); - auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); auto start0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto stop0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc}); + auto stop0 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc}); auto start1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {nc}); - auto stop1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2 * nc}); + auto stop1 = ov::op::v0::Constant::create(ov::element::i64, {1}, {2 * nc}); src0 = std::make_shared(combined, start0, stop0, step, axis); src1 = std::make_shared(combined, start1, stop1, step, axis); @@ -49,6 +55,12 @@ OutputVector translate_glu_swiglu(const NodeContext & context) { std::swap(src0, src1); } + return {src0, src1}; +} + +OutputVector translate_glu_swiglu(const NodeContext & context) { + auto [src0, src1] = get_glu_inputs(context); + auto sigmoid = std::make_shared(src0); auto silu = std::make_shared(src0, sigmoid); auto res = std::make_shared(silu, src1); @@ -56,6 +68,27 @@ OutputVector translate_glu_swiglu(const NodeContext & context) { return rename_outputs_with_suffix({res}, context.get_name()); } +OutputVector translate_glu_swiglu_oai(const NodeContext & context) { + auto [src0, src1] = get_glu_inputs(context); + + const int32_t * params = context.get_output_op_params(); + const float alpha = reinterpret_cast(params)[2]; + const float limit = reinterpret_cast(params)[3]; + + auto gate = std::make_shared(src0, -std::numeric_limits::infinity(), limit); + auto alpha_const = ov::op::v0::Constant::create(ov::element::f32, {}, {alpha}); + auto scaled_gate = std::make_shared(gate, alpha_const); + auto sigmoid = std::make_shared(scaled_gate); + auto out_glu = std::make_shared(gate, sigmoid); + + auto up = std::make_shared(src1, -limit, limit); + auto one = ov::op::v0::Constant::create(ov::element::f32, {}, {1.0f}); + auto up_plus_one = std::make_shared(up, one); + auto res = std::make_shared(out_glu, up_plus_one); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + } // namespace op } // namespace ggml } // namespace frontend diff --git a/ggml/src/ggml-openvino/openvino/op/im2col.cpp b/ggml/src/ggml-openvino/openvino/op/im2col.cpp new file mode 100644 index 000000000000..856e97f79d86 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/im2col.cpp @@ -0,0 +1,120 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" +#include "ggml-impl.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_im2col(const NodeContext & context) { + num_inputs_check(context, 2, 2); + const int32_t * params = context.get_output_op_params(); + int32_t s0 = params[0]; + int32_t s1 = params[1]; + int32_t p0 = params[2]; + int32_t p1 = params[3]; + int32_t d0 = params[4]; + int32_t d1 = params[5]; + bool is_2D = params[6] == 1; + ov::Output res; + + ov::Output image = context.get_input(1); + const ov::Shape kernel_shape = context.get_input(0).get_shape(); + + const size_t IC = is_2D ? kernel_shape[1] : kernel_shape[2]; + const size_t KH = is_2D ? kernel_shape[2] : 1; + const size_t KW = kernel_shape[3]; + + int32_t stride_w = s0; + int32_t stride_h = is_2D ? s1 : 1; + int32_t pad_w = p0; + int32_t pad_h = is_2D ? p1 : 0; + int32_t dil_w = d0; + int32_t dil_h = is_2D ? d1 : 1; + + if (!is_2D) { + // GGML input shape: [IW, IC, N, 1] + // OpenVINO input shape: [1, N, IC, IW] + // Reshape image to: [N, IC, 1, IW] + const ov::Shape image_shape = image.get_shape(); + const size_t N = image_shape[1]; + const size_t IW = image_shape[3]; + auto image_reshape_shape = ov::op::v0::Constant::create( + ov::element::i64, ov::Shape{4}, + std::vector{static_cast(N), static_cast(IC), 1, static_cast(IW)}); + image = std::make_shared(image, image_reshape_shape, false); + } + + const ov::Shape patch_sizes = {KH, KW}; + const ov::Strides strides = {static_cast(stride_h), static_cast(stride_w)}; + const ov::Shape rates = {static_cast(dil_h), static_cast(dil_w)}; + + auto pads_begin = + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, std::vector{0, 0, pad_h, pad_w}); + auto pads_end = + ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, std::vector{0, 0, pad_h, pad_w}); + + auto pad = std::make_shared(image, pads_begin, pads_end, ov::op::PadMode::CONSTANT); + auto patches = + std::make_shared(pad, patch_sizes, strides, rates, ov::op::PadType::VALID); + + // [N, KH*KW*IC, OH, OW] → [N, OH, OW, KH*KW*IC] + auto perm1 = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, std::vector{0, 2, 3, 1}); + auto t1 = std::make_shared(patches, perm1); + + // [N, OH, OW, KH*KW*IC] → [N, OH, OW, KH*KW, IC] + const ov::Shape out_shape = t1->get_output_shape(0); + const size_t N = out_shape[0]; + const size_t OH = out_shape[1]; + const size_t OW = out_shape[2]; + auto reshape1_shape = ov::op::v0::Constant::create( + ov::element::i64, ov::Shape{5}, + std::vector{static_cast(N), static_cast(OH), static_cast(OW), + static_cast(KH * KW), static_cast(IC)}); + auto r1 = std::make_shared(t1, reshape1_shape, false); + + // [N, OH, OW, KH*KW, IC] → [N, OH, OW, IC, KH*KW] + auto perm2 = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{5}, std::vector{0, 1, 2, 4, 3}); + auto t2 = std::make_shared(r1, perm2); + + // flatten back to [N, OH, OW, IC*KH*KW] + auto r2_shape = ov::op::v0::Constant::create( + ov::element::i64, ov::Shape{4}, + std::vector{static_cast(N), static_cast(OH), static_cast(OW), + static_cast(IC * KH * KW)}); + res = std::make_shared(t2, r2_shape, false); + + if (!is_2D) { + // [N, 1, OW, IC * KW] -> [1, N, OW, IC * KW] + auto final_reshape_shape = ov::op::v0::Constant::create( + ov::element::i64, ov::Shape{4}, + std::vector{1, static_cast(N), static_cast(OW), static_cast(IC * KW)}); + res = std::make_shared(res, final_reshape_shape, false); + } + + auto output_type = context.get_output_type(); + if (res.get_element_type() != output_type) { + res = std::make_shared(res, output_type); + } + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/l2_norm.cpp b/ggml/src/ggml-openvino/openvino/op/l2_norm.cpp new file mode 100644 index 000000000000..4b8ed3b6c4a2 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/l2_norm.cpp @@ -0,0 +1,44 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_l2_norm(const NodeContext & context) { + num_inputs_check(context, 1, 1); + + auto input_node = process_view_input_new(context, 0); + + auto squared = std::make_shared(input_node, input_node); + + auto sum_squared = std::make_shared( + squared, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1}), true); + + auto l2_norm = std::make_shared(sum_squared); + + float eps; + memcpy(&eps, context.get_output_op_params(), sizeof(float)); + + auto eps_const = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {eps}); + auto clamped_norm = std::make_shared(l2_norm, eps_const); + + auto res = std::make_shared(input_node, clamped_norm); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/mul_mat_id.cpp b/ggml/src/ggml-openvino/openvino/op/mul_mat_id.cpp new file mode 100644 index 000000000000..6df2784c2e45 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/mul_mat_id.cpp @@ -0,0 +1,226 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +namespace { + +std::shared_ptr const_i64(const std::vector & values) { + return ov::op::v0::Constant::create(ov::element::i64, ov::Shape{values.size()}, values); +} + +ov::Output slice_axis(const ov::Output & input, int64_t axis, int64_t begin, int64_t end) { + return std::make_shared(input, const_i64({begin}), const_i64({end}), const_i64({1}), + const_i64({axis})); +} + +ov::Output translate_mul_mat_id_mxfp4_packed(const NodeContext & context, + ov::Output expert_weights, + ov::Output activations, + ov::Output ids) { + auto packed_shape = expert_weights.get_partial_shape().to_shape(); + FRONT_END_OP_CONVERSION_CHECK(packed_shape.size() == 5 && packed_shape[4] == 17, + "Expected packed MXFP4 expert weights with shape [1, n_expert, m, k_blocks, 17]"); + + const int64_t n_expert = static_cast(packed_shape[1]); + const int64_t rows = static_cast(packed_shape[2]); + const int64_t k_blocks = static_cast(packed_shape[3]); + const int64_t qk = 32; + const int64_t cols = k_blocks * qk; + + auto packed_shape_4d = const_i64({n_expert, rows, k_blocks, 17}); + expert_weights = std::make_shared(expert_weights, packed_shape_4d, false); + + auto activations_shape_4d = std::make_shared(activations, ov::element::i64); + auto ids_shape_4d = std::make_shared(ids, ov::element::i64); + auto activations_shape_3d = get_dimensions(activations_shape_4d, {1, 2, 3}); + auto ids_shape_2d = get_dimensions(ids_shape_4d, {2, 3}); + + activations = std::make_shared(activations, activations_shape_3d, false); + ids = std::make_shared(ids, ids_shape_2d, false); + if (ids.get_element_type() != ov::element::i32 && ids.get_element_type() != ov::element::i64) { + ids = std::make_shared(ids, ov::element::i32); + } + + auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); + + static const std::vector f4e2m1_lut = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f, + -0.0f, -0.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f}; + std::vector e8m0_lut(256); + for (size_t i = 0; i < e8m0_lut.size(); ++i) { + uint32_t bits = static_cast(i) << 23; + memcpy(&e8m0_lut[i], &bits, sizeof(float)); + } + e8m0_lut[0] = std::numeric_limits::min() / 2.0f; + e8m0_lut[255] = std::numeric_limits::quiet_NaN(); + + auto f4_lut = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{f4e2m1_lut.size()}, f4e2m1_lut); + auto scale_lut = ov::op::v0::Constant::create(ov::element::f32, ov::Shape{e8m0_lut.size()}, e8m0_lut); + + auto selected_packed_weights = std::make_shared(expert_weights, ids, gather_axis); + auto scale_byte = slice_axis(selected_packed_weights, 4, 0, 1); + auto qs = slice_axis(selected_packed_weights, 4, 1, 17); + auto low = std::make_shared( + qs, ov::op::v0::Constant::create(ov::element::u8, ov::Shape{}, {0x0F}), ov::op::AutoBroadcastType::NUMPY); + auto high_shift = std::make_shared( + qs, ov::op::v0::Constant::create(ov::element::u8, ov::Shape{}, {4}), ov::op::AutoBroadcastType::NUMPY); + auto nibbles = std::make_shared(ov::OutputVector{low, high_shift}, 4); + auto nibble_indices = std::make_shared(nibbles, ov::element::i32); + auto weights_f32 = std::make_shared(f4_lut, nibble_indices, gather_axis); + + auto scale_indices = std::make_shared(scale_byte, ov::element::i32); + auto scales_f32 = std::make_shared(scale_lut, scale_indices, gather_axis); + ov::Output selected_weights = std::make_shared(weights_f32, scales_f32, + ov::op::AutoBroadcastType::NUMPY); + + auto ids_shape = std::make_shared(ids, ov::element::i64); + auto selected_weights_target_dims = std::make_shared( + ov::OutputVector{get_dimensions(ids_shape, {0, 1}), const_i64({rows, cols})}, 0); + selected_weights = std::make_shared(selected_weights, selected_weights_target_dims, false); + + auto activations_shape = std::make_shared(activations, ov::element::i64); + ov::Output acts_target_dims = std::make_shared( + ov::OutputVector{ + get_dimensions(activations_shape, {0}), + get_dimensions(ids_shape, {1}), + get_dimensions(activations_shape, {2}), + }, + 0); + ov::Output acts_broadcasted = + std::make_shared(activations, acts_target_dims, ov::op::BroadcastType::BIDIRECTIONAL); + + auto activations_expanded = std::make_shared(acts_broadcasted, const_i64({2})); + ov::Output result = + std::make_shared(activations_expanded, selected_weights, false, true); + + auto batch_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto row_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {rows}); + auto result_target_dims = std::make_shared( + ov::OutputVector{batch_dim, get_dimensions(ids_shape, {0, 1}), row_dim}, 0); + result = std::make_shared(result, result_target_dims, false); + + const auto output_type = context.get_output_type(); + if (result.get_element_type() != output_type) { + result = std::make_shared(result, output_type); + } + return result; +} + +} // namespace + +OutputVector translate_mul_mat_id(const NodeContext & context) { + num_inputs_check(context, 3, 3); + + auto expert_weights = process_view_input_new(context, 0); + auto activations = process_view_input_new(context, 1); + auto ids = process_view_input_new(context, 2); + + if (expert_weights.get_element_type() == ov::element::u8 && expert_weights.get_partial_shape().rank().is_static() && + expert_weights.get_partial_shape().rank().get_length() == 5) { + return rename_outputs_with_suffix({translate_mul_mat_id_mxfp4_packed(context, expert_weights, activations, ids)}, + context.get_name()); + } + + // OpenVINO sees GGML tensors in reversed dimension order: + // weights: [1, n_expert, m, k] + // activations: [1, n_tokens, n_used_or_1, k] + // ids: [1, 1, n_tokens, n_used] + // Rebuild the logical ranks explicitly from the 4D inputs instead of relying + // on fixed squeeze axes: real graphs can arrive through VIEW/RESHAPE chains + // where singleton axes are still represented differently at this point. + auto expert_weights_shape_4d = std::make_shared(expert_weights, ov::element::i64); + auto activations_shape_4d = std::make_shared(activations, ov::element::i64); + auto ids_shape_4d = std::make_shared(ids, ov::element::i64); + + auto expert_weights_shape_3d = get_dimensions(expert_weights_shape_4d, {1, 2, 3}); + auto activations_shape_3d = get_dimensions(activations_shape_4d, {1, 2, 3}); + auto ids_shape_2d = get_dimensions(ids_shape_4d, {2, 3}); + + expert_weights = std::make_shared(expert_weights, expert_weights_shape_3d, false); + activations = std::make_shared(activations, activations_shape_3d, false); + ids = std::make_shared(ids, ids_shape_2d, false); + + if (ids.get_element_type() != ov::element::i32 && ids.get_element_type() != ov::element::i64) { + ids = std::make_shared(ids, ov::element::i32); + } + + auto gather_axis = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{}, {0}); + ov::Output selected_weights = std::make_shared(expert_weights, ids, gather_axis); + + const auto output_type = context.get_output_type(); + if (selected_weights.get_element_type() != ov::element::f32) { + selected_weights = std::make_shared(selected_weights, ov::element::f32); + } + if (activations.get_element_type() != ov::element::f32) { + activations = std::make_shared(activations, ov::element::f32); + } + + auto activations_shape = std::make_shared(activations, ov::element::i64); + auto ids_shape = std::make_shared(ids, ov::element::i64); + ov::Output acts_target_dims = std::make_shared( + ov::OutputVector{ + get_dimensions(activations_shape, {0}), + get_dimensions(ids_shape, {1}), + get_dimensions(activations_shape, {2}), + }, + 0); + ov::Output acts_broadcasted = + std::make_shared(activations, acts_target_dims, ov::op::BroadcastType::BIDIRECTIONAL); + + auto unsqueeze_axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + auto activations_expanded = std::make_shared(acts_broadcasted, unsqueeze_axes); + + auto batch_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto output_shape = context.get_output_shape(); + FRONT_END_OP_CONVERSION_CHECK(output_shape.rank().is_static() && output_shape.rank().get_length() == 4, + "Unexpected MUL_MAT_ID output rank"); + FRONT_END_OP_CONVERSION_CHECK(output_shape[3].is_static(), "Expected static row dimension for MUL_MAT_ID output"); + const auto row_dim_value = output_shape[3].get_length(); + auto row_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {row_dim_value}); + + ov::Output result = + std::make_shared(activations_expanded, selected_weights, false, true); + + auto result_target_dims = std::make_shared( + ov::OutputVector{ + batch_dim, + get_dimensions(ids_shape, {0, 1}), + row_dim, + }, + 0); + result = std::make_shared(result, result_target_dims, false); + + if (result.get_element_type() != output_type) { + result = std::make_shared(result, output_type); + } + + return rename_outputs_with_suffix({result}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp index 38edec85ddf7..41d7c54ae6be 100644 --- a/ggml/src/ggml-openvino/openvino/op/mulmat.cpp +++ b/ggml/src/ggml-openvino/openvino/op/mulmat.cpp @@ -30,17 +30,16 @@ OutputVector translate_mulmat(const NodeContext & context) { int op_case = context.get_op_case(); ov::Output res; - ov::Output B = context.get_input(0); - ov::Output A = context.get_input(1); - - bool transpose_b = true; - if (op_case == 2) { - B = B.get_node_shared_ptr()->input_value(0); - transpose_b = false; - } else if (op_case == 3) { + ov::Output B; + ov::Output A; + if (op_case == 3) { B = process_view_input(context, 0); A = process_view_input(context, 1); + } else { + B = process_view_input_new(context, 0); + A = process_view_input_new(context, 1); } + if (A.get_element_type() != B.get_element_type()) { B = std::make_shared(context.get_input(0), context.get_input_type(1)); } @@ -55,6 +54,7 @@ OutputVector translate_mulmat(const NodeContext & context) { auto batch_small = A_batch_larger ? B_batch : A_batch; Output Z = A_batch_larger ? B : A; + auto Z_shape = A_batch_larger ? B_shape : A_shape; int64_t factor = batch_large / batch_small; if (factor > 1 && batch_small > 1) { auto batch_large_node = ov::op::v0::Constant::create(ov::element::i64, {1}, std::vector{batch_large}); @@ -67,7 +67,11 @@ OutputVector translate_mulmat(const NodeContext & context) { auto broadcast_shape = ov::op::v0::Constant::create( ov::element::i64, {5}, {(int64_t) 1, (int64_t) 1, factor, (int64_t) 1, (int64_t) 1}); auto new_Z_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, - {(int64_t) 0, batch_large, (int64_t) -1, (int64_t) A_shape[3]}); + {(int64_t) 0, batch_large, (int64_t) -1, (int64_t) Z_shape[3]}); + if (op_case == 2) { + new_Z_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, + {(int64_t) 0, batch_large, (int64_t) Z_shape[2], (int64_t) -1}); + } auto Z_broadcasted = std::make_shared(Z_unsqueezed, broadcast_shape, ov::op::BroadcastType::BIDIRECTIONAL); @@ -79,8 +83,14 @@ OutputVector translate_mulmat(const NodeContext & context) { A = Z; } + bool transpose_b = true; res = std::make_shared(A, B, false, transpose_b); + const auto output_type = context.get_output_type(); + if (res.get_element_type() != output_type) { + res = std::make_shared(res, output_type); + } + return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/norm.cpp b/ggml/src/ggml-openvino/openvino/op/norm.cpp new file mode 100644 index 000000000000..c8bedb6dbf59 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/norm.cpp @@ -0,0 +1,58 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_norm(const NodeContext & context) { + num_inputs_check(context, 1, 1); + + auto input_node = process_view_input_new(context, 0); + + // Step 1: Calculate mean along the last dimension + // mean = reduce_mean(input, axis=-1, keepdims=true) + auto mean = std::make_shared( + input_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1}), true); + + // Step 2: Calculate (input - mean) + auto centered = std::make_shared(input_node, mean); + + // Step 3: Calculate squared differences (input - mean)^2 + auto squared = std::make_shared( + centered, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {2.0f})); + + // Step 4: Calculate variance = mean((input - mean)^2) + auto variance = std::make_shared( + squared, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1}), true); + + // Step 5: Get epsilon from op_params + float eps; + memcpy(&eps, context.get_output_op_params(), sizeof(float)); + + // Step 6: Calculate std = sqrt(variance + eps) + auto std_dev = std::make_shared(std::make_shared( + variance, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {eps}))); + + // Step 7: Normalize: output = (input - mean) / std + auto res = std::make_shared(centered, std_dev); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/pad.cpp b/ggml/src/ggml-openvino/openvino/op/pad.cpp new file mode 100644 index 000000000000..492033d1b787 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/pad.cpp @@ -0,0 +1,95 @@ +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +namespace { + +ov::Output translate_circular_pad(ov::Output input, + const std::array & pads, + const ov::Shape & input_shape) { + ov::Output result = input; + + const std::array pads_begin = {pads[6], pads[4], pads[2], pads[0]}; + const std::array pads_end = {pads[7], pads[5], pads[3], pads[1]}; + + for (size_t axis = 0; axis < input_shape.size(); ++axis) { + const int64_t input_dim = static_cast(input_shape[axis]); + const int64_t pad_begin = pads_begin[axis]; + const int64_t pad_end = pads_end[axis]; + + if (pad_begin == 0 && pad_end == 0) { + continue; + } + + FRONT_END_CHECK_IMPLEMENTED(input_dim > 0, "Circular PAD requires static non-zero input dimensions"); + + std::vector indices(static_cast(input_dim + pad_begin + pad_end)); + for (int64_t index = 0; index < static_cast(indices.size()); ++index) { + int64_t wrapped = (index - pad_begin) % input_dim; + if (wrapped < 0) { + wrapped += input_dim; + } + indices[static_cast(index)] = wrapped; + } + + auto gather_indices = ov::op::v0::Constant::create(ov::element::i64, {indices.size()}, indices); + auto gather_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {axis}); + result = std::make_shared(result, gather_indices, gather_axis); + } + + return result; +} + +} // namespace + +OutputVector translate_pad(const NodeContext & context) { + num_inputs_check(context, 1, 1); + + auto input = process_view_input_new(context, 0); + if (context.get_input_shape(0) == context.get_output_shape()) { + auto input_shape = std::make_shared(input); + auto res = std::make_shared(input, input_shape, false); + return rename_outputs_with_suffix({res}, context.get_name()); + } + + const int32_t * op_params = context.get_output_op_params(); + FRONT_END_CHECK_IMPLEMENTED(op_params != nullptr, "PAD requires output op params"); + + const std::array pads = {op_params[0], op_params[1], op_params[2], op_params[3], + op_params[4], op_params[5], op_params[6], op_params[7]}; + const bool circular = op_params[8] != 0; + + if (circular) { + auto res = translate_circular_pad(input, pads, context.get_input_shape(0).to_shape()); + return rename_outputs_with_suffix({res}, context.get_name()); + } + + const std::vector pads_begin = {pads[6], pads[4], pads[2], pads[0]}; + const std::vector pads_end = {pads[7], pads[5], pads[3], pads[1]}; + + auto pads_begin_node = ov::op::v0::Constant::create(ov::element::i64, {pads_begin.size()}, pads_begin); + auto pads_end_node = ov::op::v0::Constant::create(ov::element::i64, {pads_end.size()}, pads_end); + auto pad_value = ov::op::v0::Constant::create(context.get_input_type(0), ov::Shape{}, {0}); + auto res = + std::make_shared(input, pads_begin_node, pads_end_node, pad_value, ov::op::PadMode::CONSTANT); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/permute.cpp b/ggml/src/ggml-openvino/openvino/op/permute.cpp index 4c800f9ee4f6..85550bff396b 100644 --- a/ggml/src/ggml-openvino/openvino/op/permute.cpp +++ b/ggml/src/ggml-openvino/openvino/op/permute.cpp @@ -12,6 +12,7 @@ #include #include #include +#include namespace ov { namespace frontend { @@ -22,16 +23,33 @@ OutputVector translate_permute(const NodeContext & context) { num_inputs_check(context, 1, 1); int op_case = context.get_op_case(); - FRONT_END_CHECK_IMPLEMENTED(op_case == 1 || op_case == 2 || op_case == 3 || op_case == 4, - "Unsupported PERMUTE case"); + FRONT_END_CHECK_IMPLEMENTED(op_case != 0, "Unsupported PERMUTE case"); + // op_case 1 is trivial permute + // op_case 2 is to permute Q. It has a preceding VIEW that reshapes Q to restore the sequqence dimension + // op_case 3 4 it to permute KV cache in the default layout + // op_case 5 6 is to permute V cache when `-fa off`, where v_trans=true ov::Output res; - auto src = context.get_input(0); - auto perm = ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 2, 1, 3}); + ov::Output src; + if (op_case == 3 || op_case == 4 || op_case == 5 || op_case == 6) { + src = context.get_input(0); + } else { + src = process_view_input_new(context, 0); + } + std::vector perm_values{0, 2, 1, 3}; + const int32_t * op_params = context.get_output_op_params(); + if (op_params != nullptr) { + for (size_t input_axis = 0; input_axis < perm_values.size(); ++input_axis) { + const size_t output_axis = static_cast(op_params[input_axis]); + perm_values[perm_values.size() - 1 - output_axis] = + static_cast(perm_values.size() - 1 - input_axis); + } + } + auto perm = ov::op::v0::Constant::create(ov::element::i64, {4}, perm_values); if (op_case == 1 || context.is_stateful()) { res = std::make_shared(src, perm); - } else if (op_case == 4) { + } else if (op_case == 2) { auto output_shape = context.get_output_shape().to_shape(); auto n_heads = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[1]}); auto head_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]}); @@ -54,13 +72,17 @@ OutputVector translate_permute(const NodeContext & context) { auto output_shape = context.get_output_shape().to_shape(); int64_t head_size = output_shape[3]; int64_t n_heads = output_shape[1]; + if (op_case == 5 || op_case == 6) { + head_size = output_shape[2]; + n_heads = output_shape[1]; + } int64_t ctx_per_seq = cache_shape[2].is_static() ? cache_shape[2].get_length() : -1; int64_t n_seq = cache_shape[1].get_length(); Output attention_size; if (!context.has_input("attention_size")) { attention_size = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[2]}); - } else if (op_case == 2) { + } else if (op_case == 3 || op_case == 5) { attention_size = context.get_input("attention_size"); } else { attention_size = context.get_input("attention_size_swa"); @@ -80,18 +102,41 @@ OutputVector translate_permute(const NodeContext & context) { seq_active_end = ov::op::v0::Constant::create(ov::element::i64, {1}, {seq_active_end_val}); } - // 1. reshape to [n_seq, ctx_per_seq, n_heads, head_size] + // 1. reshape to [n_seq, ctx_per_seq, n_heads, head_size] (for `-fa off` [n_seq, n_heads, head_size, ctx_per_seq]) // 2. slice out the active sequences // 3. slice out the attention part in each sequence - // 4. permute + // 4. permute (skip for `-fa off`) auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto src_reshaped = std::make_shared( - src, ov::op::v0::Constant::create(ov::element::i64, {4}, {n_seq, ctx_per_seq, n_heads, head_size}), false); - auto slice1 = std::make_shared(src_reshaped, seq_active_start, seq_active_end, one, zero); - auto slice2 = std::make_shared(slice1, zero, attention_size, one, one); - res = std::make_shared(slice2, perm); + if (op_case == 3 || op_case == 4) { + auto src_reshaped = std::make_shared( + src, ov::op::v0::Constant::create(ov::element::i64, {4}, {n_seq, ctx_per_seq, n_heads, head_size}), + false); + ov::Output after_seq_slice; + if (n_seq == 1) { + after_seq_slice = src_reshaped; + } else { + after_seq_slice = + std::make_shared(src_reshaped, seq_active_start, seq_active_end, one, zero); + } + auto slice2 = std::make_shared(after_seq_slice, zero, attention_size, one, one); + res = std::make_shared(slice2, perm); + } else { + auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); + auto src_reshaped = std::make_shared( + src, ov::op::v0::Constant::create(ov::element::i64, {4}, {n_seq, n_heads, head_size, ctx_per_seq}), + false); + ov::Output after_seq_slice; + if (n_seq == 1) { + after_seq_slice = src_reshaped; + } else { + after_seq_slice = + std::make_shared(src_reshaped, seq_active_start, seq_active_end, one, zero); + } + auto slice2 = std::make_shared(after_seq_slice, zero, attention_size, one, three); + res = slice2; + } } return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/repeat.cpp b/ggml/src/ggml-openvino/openvino/op/repeat.cpp new file mode 100644 index 000000000000..4b742134b0cf --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/repeat.cpp @@ -0,0 +1,74 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" +#include "ggml.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +// GGML_OP_REPEAT tiles src[0] to fill the destination shape. Every destination +// dimension is an integer multiple of the corresponding source dimension. +OutputVector translate_repeat(const NodeContext & context) { + num_inputs_check(context, 1, 2); + + auto input = process_view_input_new(context, 0); + + const auto input_shape = context.get_input_shape(0); + const auto output_shape = context.get_output_shape(); + + if (input_shape.rank().is_static() && output_shape.rank().is_static() && + input_shape.rank() == output_shape.rank()) { + const auto rank = static_cast(input_shape.rank().get_length()); + std::vector repeats(rank, 1); + bool all_static = true; + + for (size_t axis = 0; axis < rank; ++axis) { + if (!input_shape[axis].is_static() || !output_shape[axis].is_static()) { + all_static = false; + break; + } + + const int64_t input_dim = input_shape[axis].get_length(); + const int64_t output_dim = output_shape[axis].get_length(); + + FRONT_END_OP_CONVERSION_CHECK(input_dim > 0 && output_dim > 0 && output_dim % input_dim == 0, + "REPEAT input shape ", input_shape, " cannot tile to match ", output_shape); + + repeats[axis] = output_dim / input_dim; + } + + if (all_static) { + auto repeats_node = ov::op::v0::Constant::create(ov::element::i64, {repeats.size()}, repeats); + ov::Output res = std::make_shared(input, repeats_node); + return rename_outputs_with_suffix({res}, context.get_name()); + } + } + + // Dynamic fallback: tile by the ratio of output to input shape. + auto input_shape_node = std::make_shared(input, ov::element::i64); + std::shared_ptr target_shape_node; + if (output_shape.rank().is_static() && output_shape.is_static()) { + target_shape_node = + ov::op::v0::Constant::create(ov::element::i64, {output_shape.to_shape().size()}, output_shape.to_shape()); + } else { + target_shape_node = std::make_shared(context.get_input(1), ov::element::i64); + } + auto repeats_node = std::make_shared(target_shape_node, input_shape_node); + ov::Output res = std::make_shared(input, repeats_node); + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/reshape.cpp b/ggml/src/ggml-openvino/openvino/op/reshape.cpp index efd9a5a860ab..602d3387c9f9 100644 --- a/ggml/src/ggml-openvino/openvino/op/reshape.cpp +++ b/ggml/src/ggml-openvino/openvino/op/reshape.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include namespace ov { @@ -20,7 +19,8 @@ namespace op { OutputVector translate_reshape(const NodeContext & context) { num_inputs_check(context, 1, 1); - if (context.get_input_shape(0) == context.get_output_shape()) { + if (context.get_input(0).get_partial_shape().is_static() && + context.get_input_shape(0) == context.get_output_shape()) { return {context.get_input(0)}; } @@ -34,12 +34,12 @@ OutputVector translate_reshape(const NodeContext & context) { if (op_case == 1) { if (context.is_stateful()) { new_shape_node = ov::op::v0::Constant::create( - ov::element::i64, {3}, - std::vector{-1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); + ov::element::i64, {3}, std::vector{-1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); } else { new_shape_node = ov::op::v0::Constant::create( ov::element::i64, {4}, - std::vector{(int64_t) output_shape[0], -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); + std::vector{(int64_t) output_shape[0], -1, (int64_t) output_shape[2], + (int64_t) output_shape[3]}); } } else if (op_case == 2) { new_shape_node = ov::op::v0::Constant::create( @@ -47,7 +47,14 @@ OutputVector translate_reshape(const NodeContext & context) { std::vector{(int64_t) output_shape[0], (int64_t) output_shape[1], -1, (int64_t) output_shape[3]}); } else if (op_case == 3) { - throw std::runtime_error("might be outdated RESHAPE case"); + // - 14: [ 1, 1024, 1, 1] RESHAPE Vcur-0 (reshaped) (reshaped) + // [ 512, 2, 1, 1] 0: RESHAPE Vcur-0 (reshaped) + // - 15: [ 1, 524288, 1, 1] RESHAPE cache_v_l0 (reshaped) + // [ 512, 1024, 1, 1] 0: NONE cache_v_l0 + // - 16: [ 1, 524288, 1, 1] SET_ROWS cache_v_l0 (reshaped) (view) + // [ 1, 1024, 1, 1] 0: RESHAPE Vcur-0 (reshaped) (reshaped) + // [ 1024, 1, 1, 1] 1: NONE leaf_11 + // [ 1, 524288, 1, 1] 2: RESHAPE cache_v_l0 (reshaped) new_shape_node = ov::op::v0::Constant::create( ov::element::i64, {4}, std::vector{(int64_t) output_shape[0], (int64_t) output_shape[1], -1, 1}); diff --git a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp index 72cf92283e9e..e76ec55b8aab 100644 --- a/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rms_norm.cpp @@ -19,7 +19,7 @@ namespace op { OutputVector translate_rms_norm(const NodeContext & context) { num_inputs_check(context, 1, 1); - auto input_node = context.get_input(0); + auto input_node = process_view_input_new(context, 0); auto square = std::make_shared( input_node, ov::op::v0::Constant::create(ov::element::f32, ov::Shape{1}, {2.0f})); diff --git a/ggml/src/ggml-openvino/openvino/op/rope.cpp b/ggml/src/ggml-openvino/openvino/op/rope.cpp index a8db9b38930f..9bb2d75d0a4c 100644 --- a/ggml/src/ggml-openvino/openvino/op/rope.cpp +++ b/ggml/src/ggml-openvino/openvino/op/rope.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -38,8 +39,7 @@ OutputVector translate_rope(const NodeContext & context) { auto data_node = context.get_input(0).get_node_shared_ptr(); auto output_shape = context.get_output_shape().to_shape(); int32_t * op_params = context.get_output_op_params(); - const int mode = (op_case & 0xFFFF0000) >> 16; - op_case = (op_case & 0x0000FFFF); + const int mode = op_case; constexpr int TYPE_NORMAL = 0; constexpr int TYPE_NEOX = 1; @@ -56,55 +56,146 @@ OutputVector translate_rope(const NodeContext & context) { if (context.get_input_size() == 3) { rope_freqs_weight = context.get_input(2).get_node_shared_ptr(); } - auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight, mode == TYPE_IMROPE); + auto sin_cos = make_sin_cos(op_params, inp_pos, rope_freqs_weight, mode == TYPE_IMROPE, false); sin_theta_node = sin_cos.first; cos_theta_node = sin_cos.second; } - if (op_case == 2) { - // The input comes from a VIEW - int slice_len = output_shape[2] * output_shape[3]; - data_node = process_view_input(context, 0, slice_len).get_node_shared_ptr(); + if (context.get_view_input_size(0) > 0) { + data_node = process_view_input_new(context, 0).get_node_shared_ptr(); if (context.is_stateful()) { auto data_shape = ov::op::v0::Constant::create( ov::element::i64, {3}, std::vector{-1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); data_node = std::make_shared(data_node, data_shape, false); } else { auto data_shape = ov::op::v0::Constant::create( - ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); + ov::element::i64, {4}, + std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); data_node = std::make_shared(data_node, data_shape, false); } } + auto output_type = context.get_output_type(); + if (data_node->get_element_type() != ov::element::f32) { + data_node = std::make_shared(data_node, ov::element::f32); + } + + // TODO(openvino-gpu-rope-fusion): TEMPORARY WORKAROUND - do NOT revert until the + // OpenVINO GPU plugin is updated. + // + // For TYPE_NORMAL rope (both stateful and stateless) we emit the Flux-style + // interleaved pattern below so the GPU plugin's RoPEFusionFlux matcher folds it + // into ov::op::internal::RoPE. The matcher requires rank-4 inputs, which is why + // the original even/odd Slice translation (kept in the `else if (mode == + // TYPE_NORMAL)` branch below for reference) does not get fused. + // + // Once the GPU plugin's RoPE fusion is extended to also recognize the original + // even/odd Slice form, this Flux rewrite should be removed and both modes should + // be restored to the captured even/odd translation. Until then, keep both paths: + // the active Flux rewrite here and the previous translation preserved below. if (mode == TYPE_NORMAL) { - auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]}); - Output even_slice; - Output odd_slice; - int32_t unsqueeze_dim = context.is_stateful() ? 3 : 4; - even_slice = std::make_shared(data_node, zero, end, two, neg_one); - odd_slice = std::make_shared(data_node, one, end, two, neg_one); - - Output first_half = - std::make_shared(std::make_shared(even_slice, cos_theta_node), - std::make_shared(odd_slice, sin_theta_node)); - Output second_half = - std::make_shared(std::make_shared(even_slice, sin_theta_node), - std::make_shared(odd_slice, cos_theta_node)); - - first_half = std::make_shared(first_half, - ov::op::v0::Constant::create(ov::element::i64, {1}, {unsqueeze_dim})); - second_half = std::make_shared(second_half, - ov::op::v0::Constant::create(ov::element::i64, {1}, {unsqueeze_dim})); - auto stack = std::make_shared(OutputVector{first_half, second_half}, unsqueeze_dim); - - auto data_shape = ov::op::v0::Constant::create( - ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); - res = std::make_shared(stack, data_shape, false); - } else if (mode == TYPE_NEOX) { + // Emit the Flux-style interleaved-RoPE pattern so the GPU plugin's + // RoPEFusionFlux matcher folds this subgraph into ov::op::internal::RoPE: + // x_paired = Reshape(x, [1, S, n_heads, head_size/2, 2]) + // x0, x1 = Split(x_paired, axis=-1, num_splits=2) + // x1_neg = x1 * -1 + // x_rotated = Reshape(Concat([x1_neg, x0], axis=-1), [1, S, n_heads, head_size]) + // y = x * t_cos + x_rotated * t_sin + // Mathematically equivalent to the even/odd Slice form below. + // + // RoPEFusionFlux requires rank_equals(4) on x, t_cos and t_sin. The cos/sin + // tables are already built rank-4 ([1, S, 1, head_size/2]) for both modes. In + // stateful mode the data arrives rank-3 ([S, n_heads, head_size]), so lift it + // to rank-4 ([1, S, n_heads, head_size]) here. Stateful RoPE already produced + // rank-4 output, so downstream attention is unaffected. + if (context.is_stateful()) { + auto r4_shape = ov::op::v0::Constant::create( + ov::element::i64, {4}, + std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); + data_node = std::make_shared(data_node, r4_shape, false); + } + const int64_t head_size = static_cast(output_shape[3]); + const int64_t n_heads = static_cast(output_shape[2]); + const int64_t half = head_size / 2; + + auto neg_one_f = ov::op::v0::Constant::create(data_node->get_element_type(), ov::Shape{}, {-1.0f}); + + auto paired_shape = + ov::op::v0::Constant::create(ov::element::i64, {5}, std::vector{1, -1, n_heads, half, 2}); + auto x_paired = std::make_shared(data_node, paired_shape, false); + + auto split_axis = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}); + auto data_split = std::make_shared(x_paired, split_axis, 2); + Output x0 = data_split->outputs()[0]; + Output x1 = data_split->outputs()[1]; + + auto x1_neg = std::make_shared(x1, neg_one_f); + auto x_rotated_paired = std::make_shared(ov::OutputVector{x1_neg, x0}, -1); + + auto flat_shape = + ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{1, -1, n_heads, head_size}); + auto x_rotated = std::make_shared(x_rotated_paired, flat_shape, false); + + // Expand cos/sin from [..., head_size/2] to [..., head_size] by repeating each + // entry twice. Use special_zero on the final Reshape so the seq dim passes + // through dynamically. Final rank is 4 to satisfy the matcher's predicate. + auto expand_cos_sin = [&](Output cs) { + auto cs_unsq = + std::make_shared(cs, ov::op::v0::Constant::create(ov::element::i64, {1}, {-1})); + auto bcast_target = + ov::op::v0::Constant::create(ov::element::i64, {5}, std::vector{1, 1, 1, half, 2}); + auto bcast = + std::make_shared(cs_unsq, bcast_target, ov::op::BroadcastType::BIDIRECTIONAL); + auto flat = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{0, 0, 0, head_size}); + return std::make_shared(bcast, flat, true); + }; + Output cos_full = expand_cos_sin(cos_theta_node); + Output sin_full = expand_cos_sin(sin_theta_node); + + auto y1 = std::make_shared(data_node, cos_full); + auto y2 = std::make_shared(x_rotated, sin_full); + res = std::make_shared(y1, y2); + } + // PRESERVED PREVIOUS TRANSLATION - Re-enable this branch (and remove the Flux branch above) once + // the GPU plugin's RoPE fusion is updated to recognize the even/odd Slice form; + // see the TODO(openvino-gpu-rope-fusion) note above. Do not delete. + // + // Original even/odd Slice form. In stateless mode it ran on rank-4 data + // ([1, S, n_heads, head_size]); in stateful mode on rank-3 data + // ([S, n_heads, head_size]). Either way it does not match RoPEFusionFlux + // (which needs rank-4 x in the interleaved layout), so the RoPE stays as + // discrete elementwise ops. + // + // } else if (mode == TYPE_NORMAL) { + // auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + // auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + // auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + // auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); + // auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {output_shape[3]}); + // Output even_slice; + // Output odd_slice; + // // stateful data is rank 3 (unsqueeze at axis 3), stateless is rank 4 (axis 4) + // int32_t unsqueeze_dim = context.is_stateful() ? 3 : 4; + // even_slice = std::make_shared(data_node, zero, end, two, neg_one); + // odd_slice = std::make_shared(data_node, one, end, two, neg_one); + // + // Output first_half = + // std::make_shared(std::make_shared(even_slice, cos_theta_node), + // std::make_shared(odd_slice, sin_theta_node)); + // Output second_half = + // std::make_shared(std::make_shared(even_slice, sin_theta_node), + // std::make_shared(odd_slice, cos_theta_node)); + // + // first_half = std::make_shared(first_half, + // ov::op::v0::Constant::create(ov::element::i64, {1}, {unsqueeze_dim})); + // second_half = std::make_shared(second_half, + // ov::op::v0::Constant::create(ov::element::i64, {1}, {unsqueeze_dim})); + // auto stack = std::make_shared(OutputVector{first_half, second_half}, unsqueeze_dim); + // + // auto data_shape = ov::op::v0::Constant::create( + // ov::element::i64, {4}, std::vector{1, -1, (int64_t) output_shape[2], (int64_t) output_shape[3]}); + // res = std::make_shared(stack, data_shape, false); + else if (mode == TYPE_NEOX) { auto data_split = std::make_shared( data_node, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {-1}), 2); Output slice_data_node_0 = data_split->outputs()[0]; @@ -120,8 +211,9 @@ OutputVector translate_rope(const NodeContext & context) { res = std::make_shared(ov::OutputVector{first_half_node, second_half_node}, -1); } else if (mode == TYPE_IMROPE) { - int64_t n_dims = data_node->get_shape()[3]; - auto cos_sin_shape = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1,-1,1,(n_dims >> 1)}); + int64_t n_dims = data_node->get_output_partial_shape(0)[3].get_length(); + auto cos_sin_shape = std::make_shared(ov::element::i64, ov::Shape{4}, + std::vector{1, -1, 1, (n_dims >> 1)}); auto cos_reshaped = std::make_shared(cos_theta_node, cos_sin_shape, true); auto sin_reshaped = std::make_shared(sin_theta_node, cos_sin_shape, true); @@ -140,6 +232,10 @@ OutputVector translate_rope(const NodeContext & context) { res = std::make_shared(ov::OutputVector{sub, add}, 3); } + if (res.get_element_type() != output_type) { + res = std::make_shared(res, output_type); + } + return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp index 136e4265b429..18643371e329 100644 --- a/ggml/src/ggml-openvino/openvino/op/set_rows.cpp +++ b/ggml/src/ggml-openvino/openvino/op/set_rows.cpp @@ -28,20 +28,20 @@ namespace op { OutputVector translate_set_rows(const NodeContext & context) { num_inputs_check(context, 3, 3); - auto data = context.get_input(0); + auto data = process_view_input_new(context, 0); auto indices = context.get_input(1); auto dst = context.get_input(2); data = std::make_shared(data, context.get_output_type()); - auto dst_shape = context.get_output_shape().to_shape(); + auto row_size = context.get_input_shape(2)[3].get_length(); auto ind_squeezed = std::make_shared(indices, ov::op::v0::Constant::create(ov::element::i64, {3}, {0, 1, 2})); auto data_reshaped = std::make_shared( data, ov::op::v0::Constant::create(ov::element::i64, {4}, - {(int64_t) 1, (int64_t) 1, (int64_t) -1, (int64_t) dst_shape[3]}), + {(int64_t) 1, (int64_t) 1, (int64_t) -1, (int64_t) row_size}), false); auto axes = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {2}); diff --git a/ggml/src/ggml-openvino/openvino/op/softmax.cpp b/ggml/src/ggml-openvino/openvino/op/softmax.cpp index 9f6330862be4..b391d3f91075 100644 --- a/ggml/src/ggml-openvino/openvino/op/softmax.cpp +++ b/ggml/src/ggml-openvino/openvino/op/softmax.cpp @@ -2,17 +2,19 @@ #include "../op_table.h" #include "../utils.h" -#include +#include #include +#include #include -#include -#include +#include +#include #include #include #include #include -#include #include +#include +#include #include #include #include @@ -22,63 +24,138 @@ namespace frontend { namespace ggml { namespace op { -OutputVector translate_soft_max(const NodeContext & context) { - // TODO code is outdated - num_inputs_check(context, 1, 2); +static bool is_static_one(const ov::Dimension & dim) { + return dim.is_static() && dim.get_length() == 1; +} + +static bool same_static_dim(const ov::Dimension & lhs, const ov::Dimension & rhs) { + return lhs.is_static() && rhs.is_static() && lhs.get_length() == rhs.get_length(); +} + +static bool is_attention_sinks_input_shape(const ov::PartialShape & candidate, const ov::PartialShape & logits_shape) { + if (candidate.rank().is_dynamic() || logits_shape.rank().is_dynamic() || candidate.rank().get_length() != 4 || + logits_shape.rank().get_length() != 4) { + return false; + } + + return is_static_one(candidate[0]) && is_static_one(candidate[1]) && is_static_one(candidate[2]) && + same_static_dim(candidate[3], logits_shape[1]); +} - auto input_node = context.get_input(0).get_node_shared_ptr(); - ov::Output res; +// Reimplementation of GGML_OP_SOFT_MAX semantics for OpenVINO backend: +// 1) logits = src0 * scale +// 2) logits += mask (if provided) +// 3) append attention sinks as hidden logits (if provided) +// 4) softmax over the last dimension and remove the hidden sink column +OutputVector translate_soft_max(const NodeContext & context) { + num_inputs_check(context, 1, 3); float scale = 1.0f; float max_bias = 0.0f; - auto * op_params = context.get_output_op_params(); - memcpy(&scale, (float *) op_params + 0, sizeof(float)); - memcpy(&max_bias, (float *) op_params + 1, sizeof(float)); - auto src0_shape = context.get_input_shape(0).get_shape(); - const uint32_t h = src0_shape[2]; - const uint32_t n_head = src0_shape[0]; - const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head)); - - const float m0 = powf(2.0f, -(max_bias) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - const float slope = - (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2 * (h - n_head_log2) + 1) : 1.0f; - - auto scale_node = std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); - auto scaled_input = std::make_shared(input_node, scale_node); - - if (context.get_input_size() < 2) { - res = std::make_shared(scaled_input, 2); - return rename_outputs_with_suffix({res}, context.get_name()); + memcpy(&scale, (float *) context.get_output_op_params() + 0, sizeof(float)); + memcpy(&max_bias, (float *) context.get_output_op_params() + 1, sizeof(float)); + + ov::Output logits = context.get_input(0); + const bool second_input_is_sinks = + context.get_input_size() == 2 && is_attention_sinks_input_shape(context.get_input_shape(1), context.get_output_shape()); + const bool has_mask = context.get_input_size() > 1 && !second_input_is_sinks; + const bool has_sinks = second_input_is_sinks || context.get_input_size() > 2; + const size_t sinks_input_idx = second_input_is_sinks ? 1 : 2; + + // Apply scale first: logits = src0 * scale + if (scale != 1.0f) { + auto scale_const = + std::make_shared(ov::element::f32, ov::Shape{}, std::vector{scale}); + logits = std::make_shared(logits, scale_const); } - ov::Output mask_node_sliced; - if (context.has_input("KQ_mask_sliced")) { - mask_node_sliced = context.get_input("KQ_mask_sliced"); - } else { - auto token_len = get_dimensions(input_node, {1}); - auto mask_node = context.get_input(1); + FRONT_END_CHECK_IMPLEMENTED(!(max_bias > 0.0f && !has_mask), + "OpenVINO softmax ALiBi path requires mask input"); + + // Optional mask add: logits += mask + // For max_bias > 0 (ALiBi), apply per-head slope to mask before adding. + if (has_mask) { + ov::Output mask = context.get_input(1); + + // For stateful + std::string mask_name = "KQ_mask_sliced"; + if (context.get_input_names()[1].find("swa") != std::string::npos) { + mask_name = "KQ_mask_swa_sliced"; + } + if (context.has_input(mask_name)) { + mask = context.get_input(mask_name); + } + + if (mask.get_element_type() != logits.get_element_type()) { + mask = std::make_shared(mask, logits.get_element_type()); + } + + if (max_bias > 0.0f) { + auto out_shape = context.get_output_shape().to_shape(); + FRONT_END_CHECK_IMPLEMENTED(out_shape.size() == 4, "OpenVINO softmax ALiBi path expects rank-4 tensor"); + + const uint32_t n_head = static_cast(out_shape[1]); + FRONT_END_CHECK_IMPLEMENTED(n_head > 0, "OpenVINO softmax ALiBi path expects n_head > 0"); + + const uint32_t n_head_log2 = 1u << static_cast(std::floor(std::log2(static_cast(n_head)))); + const float m0 = std::pow(2.0f, -(max_bias) / static_cast(n_head_log2)); + const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / static_cast(n_head_log2)); + + std::vector slopes(n_head); + for (uint32_t h = 0; h < n_head; ++h) { + slopes[h] = h < n_head_log2 ? std::pow(m0, static_cast(h + 1)) : + std::pow(m1, static_cast(2 * (h - n_head_log2) + 1)); + } + + ov::Output slope_node = + std::make_shared(ov::element::f32, ov::Shape{n_head}, slopes); + if (slope_node.get_element_type() != mask.get_element_type()) { + slope_node = std::make_shared(slope_node, mask.get_element_type()); + } + + auto slope_shape = std::make_shared( + ov::element::i64, ov::Shape{4}, std::vector{1, static_cast(n_head), 1, 1}); + auto slope_4d = std::make_shared(slope_node, slope_shape, false); + mask = std::make_shared(mask, slope_4d); + } + + logits = std::make_shared(logits, mask); + } + + ov::Output softmax_input = logits; + if (has_sinks) { + ov::Output sinks = context.get_input(sinks_input_idx); + if (sinks.get_element_type() != logits.get_element_type()) { + sinks = std::make_shared(sinks, logits.get_element_type()); + } + + auto sink_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, {1, -1, 1, 1}); + auto sinks_4d = std::make_shared(sinks, sink_shape, false); + + auto logits_shape = std::make_shared(logits, ov::element::i64); auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - mask_node_sliced = std::make_shared(mask_node, zero, token_len, one, one); - } + auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); + auto four = ov::op::v0::Constant::create(ov::element::i64, {1}, {4}); + auto shape_axis = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - if (mask_node_sliced.get_element_type() != context.get_output_type()) { - mask_node_sliced = std::make_shared(mask_node_sliced, context.get_output_type()); - } + auto sink_prefix_shape = std::make_shared(logits_shape, zero, three, one, shape_axis); + auto sink_last_dim = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto sink_broadcast_shape = std::make_shared( + ov::OutputVector{sink_prefix_shape, sink_last_dim}, 0); + auto sink_column = std::make_shared(sinks_4d, sink_broadcast_shape, + ov::op::BroadcastType::BIDIRECTIONAL); + softmax_input = std::make_shared(ov::OutputVector{logits, sink_column}, 3); - Output slope_mask; - if (slope != 1.0f) { - auto slope_node = - std::make_shared(ov::element::f32, ov::Shape{}, std::vector{slope}); - slope_mask = std::make_shared(mask_node_sliced, slope_node); - throw std::runtime_error("Slope != 1.0f in softmax has not been tested, verify it before use."); - } - slope_mask = mask_node_sliced; + auto softmax_with_sink = std::make_shared(softmax_input, -1); + auto original_last_dim = std::make_shared(logits_shape, three, four, one, shape_axis); + auto res = std::make_shared(softmax_with_sink, zero, original_last_dim, one, three); - auto input_slope_mask_node = std::make_shared(scaled_input, slope_mask); + return rename_outputs_with_suffix({res}, context.get_name()); + } - res = std::make_shared(input_slope_mask_node, 2); + // Softmax along last dimension (equivalent to ggml softmax over ne[0]). + auto res = std::make_shared(softmax_input, -1); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/ssm_conv.cpp b/ggml/src/ggml-openvino/openvino/op/ssm_conv.cpp new file mode 100644 index 000000000000..522308726a8d --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/ssm_conv.cpp @@ -0,0 +1,59 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_ssm_conv(const NodeContext & context) { + num_inputs_check(context, 2, 2); + + auto sx = context.get_input(0); // conv state + input: OV shape [1, n_s, d_inner, ncs] + auto c = context.get_input(1); // conv1d weight: OV shape [1, 1, d_inner, d_conv] + + auto sx_shape = context.get_input_shape(0).to_shape(); // [1, n_s, d_inner, ncs] + auto c_shape = context.get_input_shape(1).to_shape(); // [1, 1, d_inner, d_conv] + + int64_t n_s = sx_shape[1]; + int64_t d_inner = sx_shape[2]; + int64_t ncs = sx_shape[3]; // d_conv - 1 + n_t + int64_t d_conv = c_shape[3]; + int64_t n_t = ncs - d_conv + 1; + + // Reshape sx from [1, n_s, d_inner, ncs] to [n_s, d_inner, ncs] for 1D GroupConvolution + auto sx_new_shape = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{n_s, d_inner, ncs}); + auto sx_reshaped = std::make_shared(sx, sx_new_shape, false); + + // Reshape c from [1, 1, d_inner, d_conv] to [d_inner, 1, 1, d_conv] + // GroupConvolution filter: [groups, out_channels/groups, in_channels/groups, kernel_size] + auto c_new_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{d_inner, 1, 1, d_conv}); + auto c_reshaped = std::make_shared(c, c_new_shape, false); + + // Depthwise 1D convolution: groups=d_inner, stride=1, no padding, no dilation + // Input: [n_s, d_inner, ncs], Filter: [d_inner, 1, 1, d_conv] + // Output: [n_s, d_inner, n_t] + auto conv = std::make_shared( + sx_reshaped, c_reshaped, ov::Strides{1}, ov::CoordinateDiff{0}, ov::CoordinateDiff{0}, ov::Strides{1}); + + // Transpose from [n_s, d_inner, n_t] to [n_s, n_t, d_inner] + auto perm = ov::op::v0::Constant::create(ov::element::i64, {3}, std::vector{0, 2, 1}); + auto transposed = std::make_shared(conv, perm); + + // Reshape to output shape [1, n_s, n_t, d_inner] + auto out_shape = ov::op::v0::Constant::create(ov::element::i64, {4}, std::vector{1, n_s, n_t, d_inner}); + auto res = std::make_shared(transposed, out_shape, false); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/sum_rows.cpp b/ggml/src/ggml-openvino/openvino/op/sum_rows.cpp new file mode 100644 index 000000000000..d04e6443be95 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/sum_rows.cpp @@ -0,0 +1,27 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_sum_rows(const NodeContext & context) { + num_inputs_check(context, 1, 1); + + auto input = process_view_input_new(context, 0); + auto res = std::make_shared( + input, ov::op::v0::Constant::create(ov::element::i64, ov::Shape{1}, {-1}), true); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/transpose.cpp b/ggml/src/ggml-openvino/openvino/op/transpose.cpp index 8e62e83c0d78..8d89ca556d68 100644 --- a/ggml/src/ggml-openvino/openvino/op/transpose.cpp +++ b/ggml/src/ggml-openvino/openvino/op/transpose.cpp @@ -12,8 +12,39 @@ namespace op { OutputVector translate_transpose(const NodeContext & context) { num_inputs_check(context, 1, 1); + // Compute permute order from input/output shape and stride information + // so it adapts to different input and output layouts. + auto input_shape = context.get_input_shape(0).to_shape(); + auto input_stride = context.get_input_stride(0); + auto output_shape = context.get_output_shape().to_shape(); + auto output_stride = context.get_output_stride(); + + // Compute permute order by matching output and input stride rankings. + // Build pairs. + std::vector> output_stride_dims; + std::vector> input_stride_dims; + + for (int i = 0; i < 4; ++i) { + output_stride_dims.push_back({output_stride[i], i}); + input_stride_dims.push_back({input_stride[i], i}); + } + + // Sort by stride in descending order. + std::sort(output_stride_dims.rbegin(), output_stride_dims.rend()); + std::sort(input_stride_dims.rbegin(), input_stride_dims.rend()); + + // Build permute order. + std::vector permute_order(4); + for (int i = 0; i < 4; ++i) { + int output_dim = output_stride_dims[i].second; + int input_dim = input_stride_dims[i].second; + permute_order[output_dim] = input_dim; + } + + auto input = process_view_input_new(context, 0); + auto res = std::make_shared( - context.get_input(0), ov::op::v0::Constant::create(ov::element::i64, {4}, {0, 1, 3, 2})); + input, ov::op::v0::Constant::create(ov::element::i64, {4}, permute_order)); return rename_outputs_with_suffix({res}, context.get_name()); } diff --git a/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp deleted file mode 100644 index d1e9efc33a55..000000000000 --- a/ggml/src/ggml-openvino/openvino/op/unary_gelu.cpp +++ /dev/null @@ -1,25 +0,0 @@ -#include "../node_context.h" -#include "../op_table.h" -#include "../utils.h" - -#include -#include - -namespace ov { -namespace frontend { -namespace ggml { -namespace op { - -OutputVector translate_unary_gelu(const NodeContext & context) { - num_inputs_check(context, 1, 1); - - auto input = context.get_input(0); - auto res = std::make_shared(input); - - return rename_outputs_with_suffix({res}, context.get_name()); -} - -} // namespace op -} // namespace ggml -} // namespace frontend -} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp index 037e0b94df1f..48ee0431ff76 100644 --- a/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp +++ b/ggml/src/ggml-openvino/openvino/op/unary_silu.cpp @@ -14,7 +14,7 @@ namespace op { OutputVector translate_unary_silu(const NodeContext & context) { num_inputs_check(context, 1, 1); - auto input = context.get_input(0); + auto input = process_view_input_new(context, 0); auto sigmoid = std::make_shared(input); auto res = std::make_shared(input, sigmoid); diff --git a/ggml/src/ggml-openvino/openvino/op/unary_softplus.cpp b/ggml/src/ggml-openvino/openvino/op/unary_softplus.cpp new file mode 100644 index 000000000000..756d9c33d736 --- /dev/null +++ b/ggml/src/ggml-openvino/openvino/op/unary_softplus.cpp @@ -0,0 +1,38 @@ +#include "../node_context.h" +#include "../op_table.h" +#include "../utils.h" + +#include +#include +#include +#include +#include +#include +#include + +namespace ov { +namespace frontend { +namespace ggml { +namespace op { + +OutputVector translate_unary_softplus(const NodeContext & context) { + num_inputs_check(context, 1, 1); + + auto input = process_view_input_new(context, 0); + const auto element_type = input.get_element_type(); + auto one = ov::op::v0::Constant::create(element_type, ov::Shape{}, {1.0f}); + + auto positive = std::make_shared(input); + auto abs = std::make_shared(input); + auto neg_abs = std::make_shared(abs); + auto exp_neg_abs = std::make_shared(neg_abs); + auto log_term = std::make_shared(std::make_shared(one, exp_neg_abs)); + auto res = std::make_shared(positive, log_term); + + return rename_outputs_with_suffix({res}, context.get_name()); +} + +} // namespace op +} // namespace ggml +} // namespace frontend +} // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/op/view.cpp b/ggml/src/ggml-openvino/openvino/op/view.cpp index 8528d2523367..28004dcd2d8d 100644 --- a/ggml/src/ggml-openvino/openvino/op/view.cpp +++ b/ggml/src/ggml-openvino/openvino/op/view.cpp @@ -1,6 +1,11 @@ #include "../op_table.h" #include "../utils.h" + +#include #include +#include +#include + namespace ov { namespace frontend { namespace ggml { @@ -9,42 +14,102 @@ namespace op { OutputVector translate_view(const NodeContext & context) { num_inputs_check(context, 1, 1); - if (context.get_op_case() == 2) { - auto dst_shape = context.get_output_shape().to_shape(); - return rename_outputs_with_suffix({process_view_input(context, 0, dst_shape[2] * dst_shape[3])}, - context.get_name()); + if (!context.is_static()) { + return {context.get_input(0)}; } - // op_case 3 - if (context.get_op_case() == 3) { - auto input = context.get_input(0); - auto input_ov_shape = input.get_partial_shape(); - auto input_llama_shape = context.get_input_shape(0).to_shape(); + auto input = context.get_input(0); + auto src_shape = context.get_input_shape(0); + auto dst_shape = context.get_output_shape(); + + if (src_shape.rank().is_dynamic() || dst_shape.rank().is_dynamic()) { + return {input}; + } - // if the input ov shape size is different from the input llama shape size, it means the input is already reshaped and we need to reshape it back to the original shape before slicing - if (input_ov_shape.size() != input_llama_shape.size()) { - input = std::make_shared(input, ov::op::v0::Constant::create(ov::element::i64, {input_llama_shape.size()}, input_llama_shape), false); + int64_t src_elems = 1, dst_elems = 1; + for (int64_t i = 0; i < src_shape.rank().get_length(); ++i) { + if (src_shape[i].is_dynamic()) { + return {input}; } + src_elems *= src_shape[i].get_length(); + } + for (int64_t i = 0; i < dst_shape.rank().get_length(); ++i) { + if (dst_shape[i].is_dynamic()) { + return {input}; + } + dst_elems *= dst_shape[i].get_length(); + } - auto dst_shape = context.get_output_shape().to_shape(); + if (dst_elems >= src_elems) { + return {input}; + } + + auto src_stride = context.get_input_stride(0); + auto dst_stride = context.get_output_stride(); + size_t view_offset = context.get_output_op_offset(); - // find the index of dst_shape that is different from input shape, and use that index to slice the input - int slice_dim = -1; - for (size_t i = 0; i < dst_shape.size(); ++i) { - if (dst_shape[i] != input_llama_shape[i]) { - slice_dim = i; + bool same_stride = (src_stride.size() == dst_stride.size()); + if (same_stride) { + for (size_t i = 0; i < src_stride.size(); ++i) { + if (src_stride[i] != dst_stride[i]) { + same_stride = false; break; } } + } - auto begin = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto end = ov::op::v0::Constant::create(ov::element::i64, {1}, {dst_shape[slice_dim]}); - auto stride = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim}); - auto sliced = std::make_shared(input, begin, end, stride, axes); - return {sliced}; + if (!same_stride) { + return {input}; } - return {context.get_input(0)}; + + auto src_ov_shape = src_shape.to_shape(); + auto dst_ov_shape = dst_shape.to_shape(); + size_t ndims = src_ov_shape.size(); + if (dst_ov_shape.size() != ndims) { + return {input}; + } + + std::vector diff_dims; + for (size_t i = 0; i < ndims; ++i) { + if (src_ov_shape[i] != dst_ov_shape[i]) { + diff_dims.push_back(static_cast(i)); + } + } + + if (diff_dims.size() != 1) { + return {input}; + } + + int slice_dim = diff_dims[0]; + int64_t dim_size = static_cast(src_ov_shape[slice_dim]); + + size_t ov_stride_for_dim = 1; + for (size_t i = slice_dim + 1; i < ndims; ++i) { + ov_stride_for_dim *= src_ov_shape[i]; + } + size_t elem_size = src_stride.back(); + if (elem_size == 0) { + elem_size = 1; + } + + int64_t begin_val = 0; + if (ov_stride_for_dim > 0 && elem_size > 0) { + begin_val = static_cast((view_offset / elem_size) / ov_stride_for_dim); + } + int64_t end_val = begin_val + static_cast(dst_ov_shape[slice_dim]); + + if (begin_val < 0 || end_val > dim_size) { + return {input}; + } + + auto sliced = + std::make_shared(input, ov::op::v0::Constant::create(ov::element::i64, {1}, {begin_val}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {end_val}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim})); + + sliced->set_friendly_name(context.get_output_name()); + return {sliced->output(0)}; } } // namespace op diff --git a/ggml/src/ggml-openvino/openvino/op_table.cpp b/ggml/src/ggml-openvino/openvino/op_table.cpp index 1385539279cb..59fd26df8cd5 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.cpp +++ b/ggml/src/ggml-openvino/openvino/op_table.cpp @@ -5,9 +5,11 @@ #include #include #include +#include #include #include #include +#include namespace ov { namespace frontend { @@ -16,29 +18,45 @@ namespace ggml { std::unordered_map get_supported_ops() { using namespace ov::op; return { - {"GGML_OP_ADD", op::translate_1to1_match_2_inputs }, - {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs }, - {"GGML_OP_CONT", op::translate_cont }, - {"GGML_OP_DIV", op::translate_1to1_match_2_inputs }, - {"GGML_OP_GET_ROWS", op::translate_get_rows }, - {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, - {"GGML_OP_MUL_MAT", op::translate_mulmat }, - {"GGML_OP_PERMUTE", op::translate_permute }, - {"GGML_OP_RESHAPE", op::translate_reshape }, - {"GGML_OP_RMS_NORM", op::translate_rms_norm }, - {"GGML_OP_ROPE", op::translate_rope }, - {"GGML_OP_SCALE", op::translate_scale }, - {"GGML_OP_SOFT_MAX", op::translate_soft_max }, - {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, - {"GGML_OP_TRANSPOSE", op::translate_transpose }, - {"GGML_UNARY_OP_GELU", op::translate_unary_gelu }, - {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, - {"GGML_OP_VIEW", op::translate_view }, - {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, - {"GGML_GLU_OP_GEGLU", op::translate_glu_geglu }, - {"GGML_OP_SET_ROWS", op::translate_set_rows }, - {"GGML_OP_CPY", op::translate_cpy }, - {"GGML_OP_FLASH_ATTN_EXT", op::translate_flash_attn_ext }, + {"GGML_OP_ADD", op::translate_1to1_match_2_inputs }, + {"GGML_OP_ADD1", op::translate_1to1_match_2_inputs }, + {"GGML_OP_ADD_ID", op::translate_add_id }, + {"GGML_OP_CONCAT", op::translate_concat }, + {"GGML_OP_CONT", op::translate_cont }, + {"GGML_OP_DIV", op::translate_div }, + {"GGML_OP_GET_ROWS", op::translate_get_rows }, + {"GGML_OP_IM2COL", op::translate_im2col }, + {"GGML_OP_MUL", op::translate_1to1_match_2_inputs}, + {"GGML_OP_MUL_MAT", op::translate_mulmat }, + {"GGML_OP_MUL_MAT_ID", op::translate_mul_mat_id }, + {"GGML_OP_PERMUTE", op::translate_permute }, + {"GGML_OP_RESHAPE", op::translate_reshape }, + {"GGML_OP_RMS_NORM", op::translate_rms_norm }, + {"GGML_OP_NORM", op::translate_norm }, + {"GGML_OP_L2_NORM", op::translate_l2_norm }, + {"GGML_OP_SUM_ROWS", op::translate_sum_rows }, + {"GGML_OP_ROPE", op::translate_rope }, + {"GGML_OP_SCALE", op::translate_scale }, + {"GGML_OP_SOFT_MAX", op::translate_soft_max }, + {"GGML_OP_ARGSORT", op::translate_argsort }, + {"GGML_OP_SUB", op::translate_1to1_match_2_inputs}, + {"GGML_OP_TRANSPOSE", op::translate_transpose }, + {"GGML_UNARY_OP_GELU", op::translate_1to1_match_1_input }, + {"GGML_UNARY_OP_SILU", op::translate_unary_silu }, + {"GGML_UNARY_OP_SOFTPLUS", op::translate_unary_softplus }, + {"GGML_UNARY_OP_TANH", op::translate_1to1_match_1_input }, + {"GGML_OP_VIEW", op::translate_view }, + {"GGML_GLU_OP_SWIGLU", op::translate_glu_swiglu }, + {"GGML_GLU_OP_SWIGLU_OAI", op::translate_glu_swiglu_oai }, + {"GGML_GLU_OP_GEGLU", op::translate_glu_geglu }, + {"GGML_OP_SET_ROWS", op::translate_set_rows }, + {"GGML_OP_CPY", op::translate_cpy }, + {"GGML_OP_FLASH_ATTN_EXT", op::translate_flash_attn_ext }, + {"GGML_OP_CLAMP", op::translate_clamp }, + {"GGML_OP_PAD", op::translate_pad }, + {"GGML_OP_SSM_CONV", op::translate_ssm_conv }, + {"GGML_OP_GATED_DELTA_NET", op::translate_gated_delta_net }, + {"GGML_OP_REPEAT", op::translate_repeat }, }; } diff --git a/ggml/src/ggml-openvino/openvino/op_table.h b/ggml/src/ggml-openvino/openvino/op_table.h index f546796d2ee0..1d695fa12588 100644 --- a/ggml/src/ggml-openvino/openvino/op_table.h +++ b/ggml/src/ggml-openvino/openvino/op_table.h @@ -8,30 +8,43 @@ namespace ggml { namespace op { -#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext& context) +#define GGML_OP_CONVERTER(op) OutputVector op(const NodeContext & context) -GGML_OP_CONVERTER(translate_add); GGML_OP_CONVERTER(translate_cont); +GGML_OP_CONVERTER(translate_concat); +GGML_OP_CONVERTER(translate_add_id); +GGML_OP_CONVERTER(translate_div); GGML_OP_CONVERTER(translate_get_rows); -GGML_OP_CONVERTER(translate_mul); +GGML_OP_CONVERTER(translate_im2col); GGML_OP_CONVERTER(translate_mulmat); +GGML_OP_CONVERTER(translate_mul_mat_id); GGML_OP_CONVERTER(translate_permute); GGML_OP_CONVERTER(translate_reshape); GGML_OP_CONVERTER(translate_rms_norm); +GGML_OP_CONVERTER(translate_norm); +GGML_OP_CONVERTER(translate_l2_norm); +GGML_OP_CONVERTER(translate_sum_rows); GGML_OP_CONVERTER(translate_rope); GGML_OP_CONVERTER(translate_scale); GGML_OP_CONVERTER(translate_unary_silu); -GGML_OP_CONVERTER(translate_unary_gelu); +GGML_OP_CONVERTER(translate_unary_softplus); GGML_OP_CONVERTER(translate_soft_max); GGML_OP_CONVERTER(translate_transpose); GGML_OP_CONVERTER(translate_view); GGML_OP_CONVERTER(translate_glu_swiglu); +GGML_OP_CONVERTER(translate_glu_swiglu_oai); GGML_OP_CONVERTER(translate_glu_geglu); GGML_OP_CONVERTER(translate_set_rows); GGML_OP_CONVERTER(translate_cpy); +GGML_OP_CONVERTER(translate_argsort); GGML_OP_CONVERTER(translate_flash_attn_ext); +GGML_OP_CONVERTER(translate_clamp); +GGML_OP_CONVERTER(translate_pad); +GGML_OP_CONVERTER(translate_ssm_conv); +GGML_OP_CONVERTER(translate_gated_delta_net); +GGML_OP_CONVERTER(translate_repeat); -} // namespace op +} // namespace op std::unordered_map get_supported_ops(); diff --git a/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h index b95385611e88..c229e25fb203 100644 --- a/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h +++ b/ggml/src/ggml-openvino/openvino/pass/mark_decompression_convert_constant_folding.h @@ -1,8 +1,8 @@ #pragma once #include "mark_decompression_convert_constant_folding.h" -#include "openvino/pass/matcher_pass.hpp" #include "openvino/core/visibility.hpp" +#include "openvino/pass/matcher_pass.hpp" #ifdef OPENVINO_STATIC_LIBRARY # define TRANSFORMATIONS_API diff --git a/ggml/src/ggml-openvino/openvino/translate_session.cpp b/ggml/src/ggml-openvino/openvino/translate_session.cpp index 0f68a1f50623..d00c438e2a1f 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.cpp +++ b/ggml/src/ggml-openvino/openvino/translate_session.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -77,49 +78,48 @@ ov::pass::MakeStateful::ParamResPairs get_kv_param_res_pairs( return pairs; } -void add_sliced_mask(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { - - auto create_sliced_mask = [&](const std::string & mask_name, const std::string & sliced_name, bool is_static) { +void add_sliced_mask_stateful(TensorMap & tensor_map) { + auto create_sliced_mask = [&](const std::string & mask_name, const std::string & sliced_name) { if ((tensor_map.find(mask_name) != tensor_map.end()) && (tensor_map.find("token_len_per_seq") != tensor_map.end())) { auto token_len_per_seq = tensor_map.at("token_len_per_seq").get_node_shared_ptr(); auto mask = tensor_map.at(mask_name).get_node_shared_ptr(); - std::shared_ptr mask_sliced; - if (is_static) { - mask_sliced = mask; - } else if (ggml_model_decoder.is_stateful()) { - auto zero_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {0,0}); - auto one_2d = ov::op::v0::Constant::create(ov::element::i64, {2}, {1,1}); - auto zero_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto three_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); - auto neg_one_1d = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); - auto axes = ov::op::v0::Constant::create(ov::element::i64, {2}, {-2,-1}); - auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); - auto gather_inp_pos = std::make_shared(inp_pos, neg_one_1d, three_1d); - auto reshaped_inp_pos = std::make_shared(gather_inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), false); - auto inp_pos_incremented = std::make_shared(reshaped_inp_pos, ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {1})); - auto stop = std::make_shared(ov::OutputVector{token_len_per_seq, std::make_shared(inp_pos_incremented, token_len_per_seq)}, 0); - mask_sliced = - std::make_shared(mask, zero_2d, stop, one_2d, axes); - mask_sliced = std::make_shared(mask_sliced, ov::element::f16); - mask_sliced->set_friendly_name(sliced_name); - } else { - auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); - auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); - auto two = ov::op::v0::Constant::create(ov::element::i64, {1}, {2}); - mask_sliced = std::make_shared(mask, zero, token_len_per_seq, one, two); - mask_sliced = std::make_shared(mask_sliced, ov::element::f16); - mask_sliced->set_friendly_name(sliced_name); - } + std::shared_ptr mask_sliced = mask; + auto one = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto zero = ov::op::v0::Constant::create(ov::element::i64, {1}, {0}); + auto three = ov::op::v0::Constant::create(ov::element::i64, {1}, {3}); + auto neg_one = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + + auto step = ov::op::v0::Constant::create(ov::element::i64, {1}, {1}); + auto axes = ov::op::v0::Constant::create(ov::element::i64, {1}, {-1}); + + auto inp_pos = tensor_map.at("inp_pos").get_node_shared_ptr(); + auto last_inp_pos = std::make_shared(inp_pos, neg_one, three); + auto last_inp_pos_1d = std::make_shared( + last_inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), false); + auto last_inp_pos_cvt = std::make_shared(last_inp_pos_1d, ov::element::i64); + auto last_inp_pos_inc = std::make_shared(last_inp_pos_cvt, one); + + mask_sliced = std::make_shared(mask, zero, last_inp_pos_inc, step, axes); + mask_sliced = std::make_shared(mask_sliced, ov::element::f16); + mask_sliced->set_friendly_name(sliced_name); + tensor_map.insert({sliced_name, mask_sliced->output(0)}); } }; - create_sliced_mask("self_kq_mask", "KQ_mask_sliced", ggml_model_decoder.is_static()); - create_sliced_mask("self_kq_mask_swa", "KQ_mask_swa_sliced", ggml_model_decoder.is_static()); + create_sliced_mask("self_kq_mask", "KQ_mask_sliced"); + create_sliced_mask("self_kq_mask_swa", "KQ_mask_swa_sliced"); } void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { + // When ROPE ops in the graph have divergent op_params (e.g. gemma4's mixed + // SWA/non-SWA layers with different n_dims or freq_base), a shared sin/cos + // precompute cannot broadcast across every ROPE use. Skip it here and let + // translate_rope() build sin/cos per-op from its own op_params. + if (ggml_model_decoder.has_mixed_rope_params()) { + return; + } int32_t * rope_params = ggml_model_decoder.get_rope_params(); if (tensor_map.find("inp_pos") == tensor_map.end() || rope_params == nullptr) { return; @@ -142,8 +142,11 @@ void add_rope_sin_cos(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) // Create common patterns void preprocess(TensorMap & tensor_map, GgmlDecoder & ggml_model_decoder) { - add_sliced_mask(tensor_map, ggml_model_decoder); - add_rope_sin_cos(tensor_map, ggml_model_decoder); + if (ggml_model_decoder.is_stateful()) { + add_sliced_mask_stateful(tensor_map); + } + // This optimization is error-prone + // add_rope_sin_cos(tensor_map, ggml_model_decoder); } } // namespace @@ -288,19 +291,19 @@ std::shared_ptr TranslateSession::apply_transformations(std::shared_ptris_stateful()) { auto output_names = ggml_model_decoder->get_model_output_names(); std::map model_output_indexes; - for (size_t i=0; iget_output_size(); i++) { + for (size_t i = 0; i < model->get_output_size(); i++) { auto output_friendly_name = model->output(i).get_node_shared_ptr()->get_friendly_name(); auto output_id = model_output_indexes[output_friendly_name]; auto model_output_shape = model->output(i).get_partial_shape(); auto decoder_output_shape = ggml_model_decoder->get_output_shape(output_id); - if (model_output_shape.rank().is_static() && decoder_output_shape.rank().is_static() - && model_output_shape.rank().get_length() + 1 == decoder_output_shape.rank().get_length() - && decoder_output_shape[0].is_static() && decoder_output_shape[0].get_length() == 1) { - ppp.output(i).postprocess().custom([](const ov::Output& node) { + if (model_output_shape.rank().is_static() && decoder_output_shape.rank().is_static() && + model_output_shape.rank().get_length() + 1 == decoder_output_shape.rank().get_length() && + decoder_output_shape[0].is_static() && decoder_output_shape[0].get_length() == 1) { + ppp.output(i).postprocess().custom([](const ov::Output & node) { auto axes = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{1}, {0}); return std::make_shared(node, axes); }); diff --git a/ggml/src/ggml-openvino/openvino/translate_session.h b/ggml/src/ggml-openvino/openvino/translate_session.h index 56a14ae7c07d..675e63223a97 100644 --- a/ggml/src/ggml-openvino/openvino/translate_session.h +++ b/ggml/src/ggml-openvino/openvino/translate_session.h @@ -9,16 +9,17 @@ namespace ggml { class TranslateSession { public: - TranslateSession(const frontend::InputModel::Ptr& input_model, - const std::unordered_map& translator_map, bool naive = false); + TranslateSession(const frontend::InputModel::Ptr & input_model, + const std::unordered_map & translator_map, + bool naive = false); std::shared_ptr get_converted_model(); - std::shared_ptr translate_graph(const frontend::InputModel::Ptr& input_model); + std::shared_ptr translate_graph(const frontend::InputModel::Ptr & input_model); private: std::shared_ptr apply_transformations(std::shared_ptr model); const frontend::InputModel::Ptr m_input_model; - const std::unordered_map& m_translator_map; + const std::unordered_map & m_translator_map; std::shared_ptr m_ov_model; bool m_naive; }; diff --git a/ggml/src/ggml-openvino/openvino/utils.cpp b/ggml/src/ggml-openvino/openvino/utils.cpp index 0baaf88e17a7..4e4f5dd0492e 100644 --- a/ggml/src/ggml-openvino/openvino/utils.cpp +++ b/ggml/src/ggml-openvino/openvino/utils.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -123,7 +124,8 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params bool imrope, bool stateful) { if (stateful) { - inp_pos = std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); + inp_pos = + std::make_shared(inp_pos, ov::op::v0::Constant::create(ov::element::i64, {1}, {0})); inp_pos = std::make_shared(inp_pos, ov::element::f32); auto pos_perm = std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{2, 1, 0}); @@ -212,8 +214,9 @@ std::pair, ov::Output> make_sin_cos(int32_t * rope_params } auto one_minus_ramp = std::make_shared(one, ramp_mix); - theta = std::make_shared(std::make_shared(theta_interp, one_minus_ramp), - std::make_shared(theta_extrap, ramp_mix)); + theta = + std::make_shared(std::make_shared(theta_interp, one_minus_ramp), + std::make_shared(theta_extrap, ramp_mix)); mscale *= (1.0f + 0.1f * std::log(1.0f / freq_scale)); } } @@ -252,6 +255,548 @@ ov::Output process_view_input(const NodeContext & context, int input_i return sliced; } +ov::Output process_view_input_new(const NodeContext & context, int input_index) { + auto input = context.get_input(input_index); + + // Check if this input has view inputs + size_t view_input_size = context.get_view_input_size(input_index); + if (view_input_size == 0) { + // No view inputs, return the input as is + return input; + } + + // If translate_view already resolved this VIEW (produced a Slice), the input + // will already have the expected shape — skip re-slicing. + auto expected_ov_shape = context.get_view_input_ov_shape(input_index, 0); + auto actual_shape = input.get_partial_shape(); + if (expected_ov_shape.rank().is_static() && actual_shape.rank().is_static() && + expected_ov_shape.rank() == actual_shape.rank()) { + bool shapes_match = true; + for (int64_t i = 0; i < expected_ov_shape.rank().get_length(); ++i) { + if (!expected_ov_shape[i].is_static() || !actual_shape[i].is_static()) { + shapes_match = false; + break; + } + if (expected_ov_shape[i] != actual_shape[i]) { + shapes_match = false; + break; + } + } + if (shapes_match) { + return input; + } + } + + // In static mode, use Split instead of Slice for single-dimension reductions. + // This ensures NPUW's FOLD doesn't parametrize per-layer slice indices (which + // would introduce dynamic shapes). A shared Split node sits outside the repeated + // subgraph boundary; each layer receives one of its output ports. + if (context.is_static() && view_input_size == 1) { + auto view_stride_v = context.get_view_input_stride(input_index, 0); + auto view_src_stride_v = context.get_view_input_src_stride(input_index, 0); + auto view_ggml_shape = context.get_view_input_ggml_shape(input_index, 0); + auto view_src_ggml_shape = context.get_view_input_src_ggml_shape(input_index, 0); + auto view_offset = context.get_view_input_offset(input_index, 0); + auto view_src_offset = context.get_view_input_src_offset(input_index, 0); + + size_t ndims = view_ggml_shape.size(); + std::vector diff_dims; + if (view_src_ggml_shape.size() == ndims) { + for (size_t i = 0; i < ndims; ++i) { + if (view_ggml_shape[i] != view_src_ggml_shape[i]) { + diff_dims.push_back(static_cast(i)); + } + } + } + + if (diff_dims.size() == 1) { + int split_dim = diff_dims[0]; + int64_t num_splits = static_cast(view_src_ggml_shape[split_dim]); + int64_t chunk_size = static_cast(view_ggml_shape[split_dim]); + + // Only apply when slicing exactly 1 element from a multi-element dimension + if (chunk_size == 1 && num_splits > 1) { + // Check suffix strides match (dimensions after split_dim) + bool suffix_ok = view_stride_v.size() == view_src_stride_v.size(); + if (suffix_ok) { + for (size_t i = static_cast(split_dim) + 1; i < ndims; ++i) { + if (view_stride_v[i] != view_src_stride_v[i]) { + suffix_ok = false; + break; + } + } + } + + if (suffix_ok && view_src_stride_v[split_dim] > 0) { + size_t relative_offset = view_offset >= view_src_offset ? view_offset - view_src_offset : 0; + int64_t split_index = static_cast(relative_offset / view_src_stride_v[split_dim]); + + if (split_index >= 0 && split_index < num_splits) { + auto src_node = input.get_node_shared_ptr(); + std::string rt_key = "split_dim_" + std::to_string(split_dim); + auto & rt_info = src_node->get_rt_info(); + + if (rt_info.find(rt_key) == rt_info.end()) { + auto axis_const = + ov::op::v0::Constant::create(ov::element::i64, {}, {static_cast(split_dim)}); + auto split_node = + std::make_shared(input, axis_const, static_cast(num_splits)); + split_node->set_friendly_name(src_node->get_friendly_name() + "_split"); + rt_info[rt_key] = split_node; + } + + auto split_node = rt_info[rt_key].as>(); + return split_node->output(static_cast(split_index)); + } + } + } + } + } + + // Lambda function to process a single view operation + auto process_single_view = + [](ov::Output current, size_t view_offset, const std::vector & view_stride, + const ov::Shape & view_ggml_shape, const ov::PartialShape & view_ov_shape, const std::string & view_name, + size_t view_src_offset, const std::vector & view_src_stride, const ov::Shape & view_src_ggml_shape, + const ov::PartialShape & view_src_ov_shape, const std::string & view_src_name) -> ov::Output { + auto build_reshape_pattern = [](const ov::PartialShape & target_ov_shape, + const ov::Shape & target_ggml_shape) -> std::vector { + const size_t ndims = target_ggml_shape.size(); + std::vector reshape_pattern(ndims); + size_t dynamic_dims = 0; + + if (target_ov_shape.rank().is_static() && + target_ov_shape.rank().get_length() == static_cast(ndims)) { + for (size_t i = 0; i < ndims; ++i) { + if (target_ov_shape[i].is_static()) { + reshape_pattern[i] = target_ov_shape[i].get_length(); + } else { + reshape_pattern[i] = -1; + ++dynamic_dims; + } + } + } else { + dynamic_dims = 2; + } + + if (dynamic_dims > 1) { + for (size_t i = 0; i < ndims; ++i) { + reshape_pattern[i] = static_cast(target_ggml_shape[i]); + } + } + + return reshape_pattern; + }; + + auto build_prefix_tail_reshape_pattern = [](const ov::PartialShape & target_ov_shape, + const ov::Shape & target_ggml_shape, size_t prefix_dims, + int64_t tail_dim) -> std::vector { + std::vector reshape_pattern(prefix_dims + 1); + size_t dynamic_dims = 0; + + if (target_ov_shape.rank().is_static() && + target_ov_shape.rank().get_length() == static_cast(target_ggml_shape.size())) { + for (size_t i = 0; i < prefix_dims; ++i) { + if (target_ov_shape[i].is_static()) { + reshape_pattern[i] = target_ov_shape[i].get_length(); + } else { + reshape_pattern[i] = -1; + ++dynamic_dims; + } + } + } else { + dynamic_dims = 2; + } + + if (dynamic_dims > 1) { + for (size_t i = 0; i < prefix_dims; ++i) { + reshape_pattern[i] = static_cast(target_ggml_shape[i]); + } + } + + reshape_pattern[prefix_dims] = tail_dim; + return reshape_pattern; + }; + + bool same_stride = view_stride.size() == view_src_stride.size(); + if (same_stride) { + for (size_t i = 0; i < view_stride.size(); ++i) { + if (view_stride[i] != view_src_stride[i]) { + same_stride = false; + break; + } + } + } + + bool same_ggml_shape = view_ggml_shape.size() == view_src_ggml_shape.size(); + if (same_ggml_shape) { + for (size_t i = 0; i < view_ggml_shape.size(); ++i) { + if (view_ggml_shape[i] != view_src_ggml_shape[i]) { + same_ggml_shape = false; + break; + } + } + } + + if (same_stride && same_ggml_shape) { + return current; + } + + if (same_stride) { + const size_t relative_offset = view_offset >= view_src_offset ? view_offset - view_src_offset : 0; + const size_t ndims = view_stride.size(); + + std::vector diff_dims; + if (view_ggml_shape.size() == ndims && view_src_ggml_shape.size() == ndims) { + for (size_t i = 0; i < ndims; ++i) { + if (view_ggml_shape[i] != view_src_ggml_shape[i]) { + diff_dims.push_back(static_cast(i)); + } + } + } + + if (diff_dims.size() == 1) { + const int slice_dim = diff_dims[0]; + const int64_t dim_size = static_cast(view_src_ggml_shape[slice_dim]); + + if (view_stride[slice_dim] > 0 && relative_offset % view_stride[slice_dim] == 0) { + const int64_t begin_val = static_cast((relative_offset / view_stride[slice_dim]) % + static_cast(dim_size)); + const int64_t end_val = begin_val + static_cast(view_ggml_shape[slice_dim]); + + if (begin_val >= 0 && end_val <= dim_size) { + auto sliced = std::make_shared( + current, ov::op::v0::Constant::create(ov::element::i64, {1}, {begin_val}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {end_val}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim})); + + if (view_ov_shape.is_static()) { + auto reshaped = std::make_shared( + sliced, + ov::op::v0::Constant::create(ov::element::i64, {ndims}, view_ov_shape.to_shape()), + false); + reshaped->set_friendly_name(view_name); + return reshaped; + } + + sliced->set_friendly_name(view_name); + return sliced; + } + } + + int64_t tail_src_elems = 1; + int64_t tail_dst_elems = 1; + for (size_t i = slice_dim; i < ndims; ++i) { + tail_src_elems *= static_cast(view_src_ggml_shape[i]); + tail_dst_elems *= static_cast(view_ggml_shape[i]); + } + + const size_t elem_stride = view_stride[ndims - 1]; + int64_t tail_begin = 0; + if (elem_stride > 0) { + tail_begin = + static_cast((relative_offset / elem_stride) % static_cast(tail_src_elems)); + } + const int64_t tail_end = tail_begin + tail_dst_elems; + + if (tail_begin >= 0 && tail_end <= tail_src_elems) { + std::vector flat_shape; + for (int i = 0; i < slice_dim; ++i) { + flat_shape.push_back(static_cast(view_src_ggml_shape[i])); + } + flat_shape.push_back(tail_src_elems); + const size_t flat_ndims = flat_shape.size(); + + auto flat = std::make_shared( + current, ov::op::v0::Constant::create(ov::element::i64, {flat_ndims}, flat_shape), false); + + auto sliced = std::make_shared( + flat, ov::op::v0::Constant::create(ov::element::i64, {1}, {tail_begin}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {tail_end}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {slice_dim})); + + if (view_ov_shape.is_static()) { + auto reshaped = std::make_shared( + sliced, ov::op::v0::Constant::create(ov::element::i64, {ndims}, view_ov_shape.to_shape()), + false); + reshaped->set_friendly_name(view_name); + return reshaped; + } + + sliced->set_friendly_name(view_name); + return sliced; + } + } + + std::vector begin(ndims, 0); + std::vector end(ndims, 0); + std::vector step(ndims, 1); + std::vector axes(ndims, 0); + + size_t remaining_offset = relative_offset; + for (size_t i = 0; i < ndims; ++i) { + axes[i] = static_cast(i); + if (view_stride[i] > 0) { + begin[i] = static_cast(remaining_offset / view_stride[i]); + remaining_offset %= view_stride[i]; + } + end[i] = begin[i] + static_cast(view_ggml_shape[i]); + } + + bool in_bounds = view_src_ggml_shape.size() == ndims && view_ggml_shape.size() == ndims; + if (in_bounds) { + for (size_t i = 0; i < ndims; ++i) { + if (end[i] > static_cast(view_src_ggml_shape[i])) { + in_bounds = false; + break; + } + } + } + + if (in_bounds && remaining_offset == 0) { + auto sliced = std::make_shared( + current, ov::op::v0::Constant::create(ov::element::i64, {ndims}, begin), + ov::op::v0::Constant::create(ov::element::i64, {ndims}, end), + ov::op::v0::Constant::create(ov::element::i64, {ndims}, step), + ov::op::v0::Constant::create(ov::element::i64, {ndims}, axes)); + + sliced->set_friendly_name(view_name); + return sliced; + } + } else { + bool same_rank = view_stride.size() == view_src_stride.size() && + view_ggml_shape.size() == view_src_ggml_shape.size() && + view_stride.size() == view_ggml_shape.size(); + const size_t relative_offset = view_offset >= view_src_offset ? view_offset - view_src_offset : 0; + + if (same_rank) { + const size_t ndims = view_ggml_shape.size(); + std::vector diff_dims; + for (size_t i = 0; i < ndims; ++i) { + if (view_ggml_shape[i] != view_src_ggml_shape[i]) { + diff_dims.push_back(static_cast(i)); + } + } + + if (diff_dims.size() == 1) { + const size_t slice_dim = static_cast(diff_dims[0]); + bool suffix_stride_match = true; + for (size_t i = slice_dim + 1; i < ndims; ++i) { + if (view_stride[i] != view_src_stride[i]) { + suffix_stride_match = false; + break; + } + } + + if (suffix_stride_match && view_src_stride[slice_dim] > 0 && + relative_offset % view_src_stride[slice_dim] == 0) { + const int64_t begin_val = static_cast(relative_offset / view_src_stride[slice_dim]); + const int64_t end_val = begin_val + static_cast(view_ggml_shape[slice_dim]); + const int64_t dim_size = static_cast(view_src_ggml_shape[slice_dim]); + + if (begin_val >= 0 && end_val <= dim_size) { + auto sliced = std::make_shared( + current, ov::op::v0::Constant::create(ov::element::i64, {1}, {begin_val}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {end_val}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {static_cast(slice_dim)})); + sliced->set_friendly_name(view_name); + return sliced; + } + } + } + } + + size_t view_elems = 1; + size_t src_elems = 1; + if (same_rank) { + for (size_t i = 0; i < view_ggml_shape.size(); ++i) { + view_elems *= view_ggml_shape[i]; + src_elems *= view_src_ggml_shape[i]; + } + } + + bool same_num_elements = same_rank && view_elems == src_elems; + + if (same_rank && relative_offset == 0 && same_num_elements) { + auto reshape_pattern = build_reshape_pattern(view_ov_shape, view_ggml_shape); + + auto reshaped = std::make_shared( + current, ov::op::v0::Constant::create(ov::element::i64, {reshape_pattern.size()}, reshape_pattern), + false); + reshaped->set_friendly_name(view_name); + return reshaped; + } + + if (same_rank) { + const size_t ndims = view_ggml_shape.size(); + + // Match views that can be expressed as a regular strided slice over the + // already reconstructed source tensor, e.g. offset on one axis plus step > 1 + // on another axis. + bool is_regular_slice = view_src_ggml_shape.size() == ndims; + std::vector begin(ndims, 0); + std::vector end(ndims, 0); + std::vector step(ndims, 1); + std::vector axes(ndims, 0); + size_t remaining_offset = relative_offset; + + if (is_regular_slice) { + for (size_t i = 0; i < ndims; ++i) { + axes[i] = static_cast(i); + + if (view_src_stride[i] == 0 || view_stride[i] == 0 || + view_stride[i] % view_src_stride[i] != 0) { + is_regular_slice = false; + break; + } + + step[i] = static_cast(view_stride[i] / view_src_stride[i]); + if (step[i] <= 0) { + is_regular_slice = false; + break; + } + + begin[i] = static_cast(remaining_offset / view_src_stride[i]); + remaining_offset %= view_src_stride[i]; + + if (view_ggml_shape[i] == 0) { + end[i] = begin[i]; + continue; + } + + end[i] = begin[i] + step[i] * static_cast(view_ggml_shape[i] - 1) + 1; + + if (begin[i] < 0 || end[i] > static_cast(view_src_ggml_shape[i])) { + is_regular_slice = false; + break; + } + } + } + + if (is_regular_slice && remaining_offset == 0) { + auto sliced = std::make_shared( + current, ov::op::v0::Constant::create(ov::element::i64, {ndims}, begin), + ov::op::v0::Constant::create(ov::element::i64, {ndims}, end), + ov::op::v0::Constant::create(ov::element::i64, {ndims}, step), + ov::op::v0::Constant::create(ov::element::i64, {ndims}, axes)); + + sliced->set_friendly_name(view_name); + return sliced; + } + + const size_t elem_stride = view_src_stride.back(); + const bool aligned_offset = elem_stride > 0 && relative_offset % elem_stride == 0; + + if (aligned_offset) { + size_t suffix_start = 0; + size_t expected_stride = elem_stride; + for (int i = static_cast(ndims) - 1; i >= 0; --i) { + if (view_stride[i] != expected_stride) { + suffix_start = static_cast(i + 1); + break; + } + expected_stride *= view_ggml_shape[i]; + } + + size_t prefix_elems = 1; + size_t suffix_elems = 1; + for (size_t i = 0; i < suffix_start; ++i) { + prefix_elems *= view_ggml_shape[i]; + } + for (size_t i = suffix_start; i < ndims; ++i) { + suffix_elems *= view_ggml_shape[i]; + } + + if (prefix_elems > 0 && src_elems % prefix_elems == 0) { + const size_t src_tail_elems = src_elems / prefix_elems; + const int64_t tail_begin = static_cast(relative_offset / elem_stride); + const int64_t tail_end = tail_begin + static_cast(suffix_elems); + + if (tail_begin >= 0 && tail_end <= static_cast(src_tail_elems)) { + auto prefix_tail_pattern = build_prefix_tail_reshape_pattern( + view_ov_shape, view_ggml_shape, suffix_start, static_cast(src_tail_elems)); + + auto prefix_tail = std::make_shared( + current, + ov::op::v0::Constant::create(ov::element::i64, {prefix_tail_pattern.size()}, + prefix_tail_pattern), + false); + + ov::Output selected = prefix_tail; + if (tail_begin != 0 || tail_end != static_cast(src_tail_elems)) { + selected = std::make_shared( + prefix_tail, ov::op::v0::Constant::create(ov::element::i64, {1}, {tail_begin}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {tail_end}), + ov::op::v0::Constant::create(ov::element::i64, {1}, {1}), + ov::op::v0::Constant::create(ov::element::i64, {1}, + {static_cast(suffix_start)})); + } + + auto reshape_pattern = build_reshape_pattern(view_ov_shape, view_ggml_shape); + auto reshaped = std::make_shared( + selected, + ov::op::v0::Constant::create(ov::element::i64, {reshape_pattern.size()}, + reshape_pattern), + false); + reshaped->set_friendly_name(view_name); + return reshaped; + } + } + } + } + + return current; + } + + (void) view_name; + (void) view_src_ov_shape; + (void) view_src_name; + + return current; + }; + + // Process views from the base tensor (last) to the current view (first) + // Start with the base tensor + ov::Output current = input; + + // Process each view in reverse order (from base to current) + for (int view_idx = view_input_size - 1; view_idx >= 0; view_idx--) { + auto view_offset = context.get_view_input_offset(input_index, view_idx); + auto view_stride = context.get_view_input_stride(input_index, view_idx); + auto view_ggml_shape = context.get_view_input_ggml_shape(input_index, view_idx); + auto view_ov_shape = context.get_view_input_ov_shape(input_index, view_idx); + auto view_name = context.get_view_input_name(input_index, view_idx); + + // print view info + // std::cout << "View " << view_idx << ": name = " << view_name << ", offset = " << view_offset << ", stride = [" + // << view_stride[0] << "," << view_stride[1] << "," << view_stride[2] << "," << view_stride[3] + // << "], ggml shape = [" << view_ggml_shape[0] << "," << view_ggml_shape[1] << "," + // << view_ggml_shape[2] << "," << view_ggml_shape[3] << "], ov shape = " << view_ov_shape << std::endl; + + auto view_src_offset = context.get_view_input_src_offset(input_index, view_idx); + auto view_src_stride = context.get_view_input_src_stride(input_index, view_idx); + auto view_src_ggml_shape = context.get_view_input_src_ggml_shape(input_index, view_idx); + auto view_src_ov_shape = context.get_view_input_src_ov_shape(input_index, view_idx); + auto view_src_name = context.get_view_input_src_name(input_index, view_idx); + // print source view info + // std::cout << "View " << view_idx << ": source name = " << view_src_name + // << ", source offset = " << view_src_offset << ", source stride = [" << view_src_stride[0] << "," + // << view_src_stride[1] << "," << view_src_stride[2] << "," << view_src_stride[3] + // << "], source ggml shape = [" << view_src_ggml_shape[0] << "," << view_src_ggml_shape[1] << "," + // << view_src_ggml_shape[2] << "," << view_src_ggml_shape[3] + // << "], source ov shape = " << view_src_ov_shape << std::endl; + + current = process_single_view(current, view_offset, view_stride, view_ggml_shape, view_ov_shape, view_name, + view_src_offset, view_src_stride, view_src_ggml_shape, view_src_ov_shape, + view_src_name); + } + + return current; +} + } // namespace ggml } // namespace frontend } // namespace ov diff --git a/ggml/src/ggml-openvino/openvino/utils.h b/ggml/src/ggml-openvino/openvino/utils.h index 767dd4c53ea5..8dc3e8765e82 100644 --- a/ggml/src/ggml-openvino/openvino/utils.h +++ b/ggml/src/ggml-openvino/openvino/utils.h @@ -1,13 +1,13 @@ #pragma once +#include "node_context.h" + #include #include #include #include #include -#include "node_context.h" - namespace ov { namespace frontend { namespace ggml { @@ -16,30 +16,23 @@ std::string getCurrentTime(); void dump_ov_model(std::shared_ptr model); -void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs); +void num_inputs_check(const NodeContext & context, size_t min_inputs, size_t max_inputs); int non_cont_dim(std::vector ne, std::vector nb); -template -std::vector argsort_descend(const std::vector& v) { +template std::vector argsort_descend(const std::vector & v) { std::vector idx(v.size()); std::iota(idx.begin(), idx.end(), 0); - std::sort(idx.begin(), idx.end(), [&v](int i1, int i2) { - return v[i1] > v[i2]; - }); + std::sort(idx.begin(), idx.end(), [&v](int i1, int i2) { return v[i1] > v[i2]; }); return idx; } -template -std::vector sorted_descend(std::vector v) { - std::sort(v.begin(), v.end(), [](T a, T b) { - return a > b; - }); +template std::vector sorted_descend(std::vector v) { + std::sort(v.begin(), v.end(), [](T a, T b) { return a > b; }); return v; } -template -bool is_permuted(const std::vector& strides) { +template bool is_permuted(const std::vector & strides) { for (size_t i = 0; i < strides.size() - 1; ++i) { if (strides[i] < strides[i + 1]) { return true; @@ -48,8 +41,7 @@ bool is_permuted(const std::vector& strides) { return false; } -template -std::vector permute(const std::vector& x, const std::vector& perm) { +template std::vector permute(const std::vector & x, const std::vector & perm) { std::vector result; result.reserve(perm.size()); for (int i : perm) { @@ -58,25 +50,35 @@ std::vector permute(const std::vector& x, const std::vector& perm) { return result; } -std::shared_ptr get_dimensions(const std::shared_ptr& shape, - const std::vector& dims); -std::shared_ptr get_dimensions(const std::shared_ptr& node, const std::vector& dims); +std::shared_ptr get_dimensions(const std::shared_ptr & shape, + const std::vector & dims); +std::shared_ptr get_dimensions(const std::shared_ptr & node, const std::vector & dims); -OutputVector rename_outputs_with_suffix(const OutputVector& outputs, const std::string& suffix); +OutputVector rename_outputs_with_suffix(const OutputVector & outputs, const std::string & suffix); -std::pair, ov::Output> make_sin_cos(int32_t* rope_params, +std::pair, ov::Output> make_sin_cos(int32_t * rope_params, std::shared_ptr inp_pos, std::shared_ptr rope_freqs_weight = nullptr, bool imrope = false, bool stateful = false); -ov::Output process_view_input(const NodeContext& context, int input_index, int slice_len = 0); +ov::Output process_view_input(const NodeContext & context, int input_index, int slice_len = 0); + +ov::Output process_view_input_new(const NodeContext & context, int input_index); namespace op { -template -OutputVector translate_1to1_match_2_inputs(const NodeContext& context) { +template OutputVector translate_1to1_match_2_inputs(const NodeContext & context) { num_inputs_check(context, 2, 2); - auto res = std::make_shared(context.get_input(0), context.get_input(1)); + auto input_0 = process_view_input_new(context, 0); + auto input_1 = process_view_input_new(context, 1); + auto res = std::make_shared(input_0, input_1); + return rename_outputs_with_suffix({res}, context.get_name()); +} + +template OutputVector translate_1to1_match_1_input(const NodeContext & context) { + num_inputs_check(context, 1, 1); + auto input = process_view_input_new(context, 0); + auto res = std::make_shared(input); return rename_outputs_with_suffix({res}, context.get_name()); } } // namespace op diff --git a/ggml/src/ggml-openvino/utils.cpp b/ggml/src/ggml-openvino/utils.cpp index 998ef7c9eb4f..70af08bdf182 100644 --- a/ggml/src/ggml-openvino/utils.cpp +++ b/ggml/src/ggml-openvino/utils.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -25,9 +26,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -39,7 +42,7 @@ enum ggml_status ov_graph_compute(ggml_cgraph * cgraph, ggml_backend_t backend) { ggml_backend_openvino_context * ctx = (ggml_backend_openvino_context *) backend->context; try { - if (getenv("GGML_OPENVINO_DUMP_CGRAPH")) { + if (ggml_openvino_getenv_int("GGML_OPENVINO_DUMP_CGRAPH")) { std::string filename = "cgraph_ov.txt"; GgmlOvDecoder::dump_cgraph(cgraph, filename); } @@ -62,10 +65,92 @@ enum ggml_status ov_graph_compute(ggml_cgraph * cgraph, ggml_backend_t backend) } } +// For a KV cache input, return an ov::Tensor sized to n_kv (== attention_size +// for that layer) instead of the fully-allocated ctx_per_seq. Pre-conditions: +// * non-static (CPU/GPU) backend, single sequence, seq_active_start == 0 +// * ggml KV layout is a contiguous [1, 1, ctx_per_seq, n_heads_kv*head_size] +// so the first n_kv rows are the live prefix and shrinking the ctx axis +// gives a valid tensor over the same host storage +// * not an SWA layer (ring cache): once the window has wrapped the first +// n_kv rows no longer contain the live prefix +// On any unmet pre-condition returns std::nullopt; the caller falls back to +// the full-size tensor. +static std::optional try_make_kv_sliced_tensor(std::shared_ptr ggml_decoder, + const std::string & name, + const ggml_tensor * ggml_tensor) { + static const bool kv_slice_disabled = ggml_openvino_getenv_int("GGML_OPENVINO_DISABLE_KV_SLICE"); + if (kv_slice_disabled) { + return std::nullopt; + } + if (ggml_decoder->is_static() || ggml_decoder->is_stateful()) { + return std::nullopt; + } + if (ggml_tensor->op != GGML_OP_NONE || ggml_tensor->view_src != nullptr) { + return std::nullopt; + } + const auto * op = ggml_decoder->get_tensor_used_op(ggml_tensor); + if (!GgmlOvDecoder::is_kvcache(ggml_tensor, op)) { + return std::nullopt; + } + + const auto & compute_params = ggml_decoder->get_compute_params(); + if (compute_params.n_seq_active != 1 || compute_params.seq_active_start != 0) { + return std::nullopt; + } + + int layer; + if (auto layer_opt = extract_layer_from_name(name); layer_opt.has_value()) { + layer = layer_opt.value(); + } else { + return std::nullopt; + } + + const bool is_swa = ggml_decoder->is_swa_layer(layer); + if (is_swa) { + return std::nullopt; + } + const int ctx_per_seq = ggml_decoder->get_ctx_per_seq(); + const int n_kv = compute_params.attention_size; + if (ctx_per_seq <= 0 || n_kv <= 0 || n_kv >= ctx_per_seq) { + return std::nullopt; + } + + ov::Shape full_shape = ggml_decoder->get_shape(ggml_tensor); + if (full_shape.size() != 4 || full_shape[0] != 1 || full_shape[1] != 1 || + static_cast(full_shape[2]) != ctx_per_seq) { + return std::nullopt; + } + + ov::Shape sliced_shape = full_shape; + sliced_shape[2] = static_cast(n_kv); + + // Disabling for now as gpu has bug with in-place ScatterUpdate with remote tensors, can re-enable once CVS-186519 is fixed + // if (ggml_openvino_buffer_is_remote(ggml_tensor)) { + // auto remote_context = ggml_openvino_get_remote_context(); + // auto gpu_context = remote_context->as(); + // return gpu_context.create_tensor(ggml_decoder->get_ov_type(ggml_tensor), sliced_shape, ggml_tensor->data); + // } + + return ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), sliced_shape, ggml_tensor->data); +} + ov::Tensor create_ov_output_tensor(std::shared_ptr ggml_decoder, std::shared_ptr infer_request, int output_index, const ggml_tensor * ggml_tensor) { + if (auto sliced = try_make_kv_sliced_tensor(ggml_decoder, std::string(ggml_tensor->name), ggml_tensor)) { + return *sliced; + } + + // Disabling for now as gpu has bug with in-place ScatterUpdate with remote tensors, can re-enable once CVS-186519 is fixed + // if (ggml_tensor->extra != nullptr && !ggml_decoder->is_splited_model()) { + // auto * extra_base = static_cast(ggml_tensor->extra); + // if (extra_base->type == ggml_openvino_extra_base::Type::TENSOR) { + // auto * tensor_extra = static_cast(extra_base); + // return *tensor_extra->tensor; + // } + // } + auto output_type = ggml_decoder->get_ov_type(ggml_tensor); ov::Shape output_shape; if (ggml_decoder->is_static()) { @@ -86,7 +171,9 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< static auto is_static = false; if (is_naive(cgraph)) { - return naive_compute(cgraph, core, device, config); + if (!is_model_splitted(cgraph)) { + return naive_compute(cgraph, core, device, config); + } } auto start_time = ggml_time_us(); @@ -98,18 +185,20 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< std::tie(m_params, c_params) = GgmlOvDecoder::compute_llm_params(cgraph, is_static); graph_key key(cgraph); - bool cache_hit; + static const bool cache_enabled = !ggml_openvino_getenv_int("GGML_OPENVINO_DISABLE_CACHE"); + bool cache_hit = false; int64_t decoder_end_time; int64_t conversion_end_time; int64_t compile_end_time; int64_t infer_end_time; + int64_t ov_raw_infer_start; { std::shared_ptr entry; ModelParams old_m_params; - { + if (cache_enabled) { std::lock_guard map_lock(r_ctx->ctx_mutex); auto it = r_ctx->decoder_cache.find(key); cache_hit = it != r_ctx->decoder_cache.end(); @@ -120,6 +209,10 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< entry = std::make_shared(mutex); r_ctx->decoder_cache[key] = entry; } + } else { + auto mutex = std::make_shared(); + entry = std::make_shared(mutex); + cache_hit = false; } std::lock_guard lock(*(entry->mutex)); @@ -127,9 +220,14 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< if (cache_hit) { ggml_decoder = entry->ptr; old_m_params = ggml_decoder->get_model_params(); - cache_hit = old_m_params.can_reuse_dynamically(m_params); + if (!ggml_decoder->is_splited_model()) { + cache_hit = old_m_params.can_reuse_dynamically(m_params); + } } + std::vector ov_input_names; + std::vector ov_output_names; + if (cache_hit) { std::map> model_weights; ggml_decoder->set_compute_params(c_params); @@ -141,6 +239,8 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< { std::lock_guard map_lock(r_ctx->ctx_mutex); infer_request = r_ctx->infer_request_cache.at(key); + ov_input_names = r_ctx->ov_input_names_cache.at(key); + ov_output_names = r_ctx->ov_output_names_cache.at(key); } if (stateful) { @@ -162,14 +262,15 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< try { state_name = r_ctx->kv_state_input_name_map.at(state.get_name()); } catch (...) { - GGML_LOG_ERROR("GGML OpenVINO backend stateful inference failed: no input found for the state\n"); + GGML_LOG_ERROR( + "GGML OpenVINO backend stateful inference failed: no input found for the state\n"); return GGML_STATUS_FAILED; } auto kv_tensor = get_ov_input_tensor(ggml_decoder, state_name); - kv_tensor.set_shape({state_tensor_shape[0], kv_tensor.get_shape()[2], - state_tensor_shape[2], state_tensor_shape[3]}); - state_tensor = kv_tensor; - state_tensor_shape = state_tensor.get_shape(); + kv_tensor.set_shape({state_tensor_shape[0], kv_tensor.get_shape()[2], state_tensor_shape[2], + state_tensor_shape[3]}); + state_tensor = kv_tensor; + state_tensor_shape = state_tensor.get_shape(); } ov::Coordinate begin = {0, 0, 0, 0}; ov::Coordinate end = {state_tensor_shape[0], static_cast(pos_data[0]), @@ -177,7 +278,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< ov::Tensor new_state_tensor(state_tensor, begin, end); state.set_state(new_state_tensor); } - r_ctx->stateful_kv_size = pos_data[0] + 1; + r_ctx->stateful_kv_size = pos_data[0] + pos_shape[3]; } } @@ -185,15 +286,17 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< conversion_end_time = decoder_end_time; compile_end_time = decoder_end_time; } else { - { + if (cache_enabled) { std::lock_guard map_lock(r_ctx->ctx_mutex); r_ctx->infer_request_cache.erase(key); } + bool model_is_splitted = is_model_splitted(cgraph); std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); - ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, stateful); + ggml_decoder = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, + stateful, model_is_splitted); decoder_end_time = ggml_time_us(); auto input_model = std::make_shared(ggml_decoder); @@ -201,7 +304,7 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< ggml_decoder->clear_model_weights(); conversion_end_time = ggml_time_us(); - if (getenv("GGML_OPENVINO_DUMP_IR")) { + if (ggml_openvino_getenv_int("GGML_OPENVINO_DUMP_IR")) { char timestamped_filename[64]; auto timestamp = (long long) ggml_time_us(); snprintf(timestamped_filename, sizeof(timestamped_filename), "model_%lld.xml", timestamp); @@ -219,8 +322,6 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< infer_request = std::make_shared(compiled_model.create_infer_request()); entry->ptr = ggml_decoder; - std::vector ov_input_names; - std::vector ov_output_names; for (const auto & ov_param : model->get_parameters()) { ov_input_names.push_back(ov_param->get_friendly_name()); } @@ -228,66 +329,64 @@ enum ggml_status ov_graph_compute_dynamic(ggml_cgraph * cgraph, std::shared_ptr< ov_output_names.push_back(ov_output->get_friendly_name()); } - { + if (cache_enabled) { std::lock_guard map_lock(r_ctx->ctx_mutex); r_ctx->infer_request_cache[key] = infer_request; - r_ctx->ov_input_names_cache[key] = std::move(ov_input_names); - r_ctx->ov_output_names_cache[key] = std::move(ov_output_names); + r_ctx->ov_input_names_cache[key] = ov_input_names; + r_ctx->ov_output_names_cache[key] = ov_output_names; } - if (stateful) { + if (stateful && cache_enabled) { const auto * inp_pos = get_inp_pos_tensor(cgraph); auto pos_shape = ggml_decoder->get_shape(inp_pos); r_ctx->stateful_kv_size = pos_shape[3]; const auto kv_param_res_names = ggml_decoder->get_kv_param_res_names(); - for (const auto& pair : kv_param_res_names) { - r_ctx->kv_state_input_name_map[pair.first+pair.second] = pair.first; + for (const auto & pair : kv_param_res_names) { + r_ctx->kv_state_input_name_map[pair.first + pair.second] = pair.first; } } } - std::vector ov_input_names; - std::vector ov_output_names; - { - std::lock_guard map_lock(r_ctx->ctx_mutex); - ov_input_names = r_ctx->ov_input_names_cache[key]; - ov_output_names = r_ctx->ov_output_names_cache[key]; - } - for (size_t i = 0; i < ov_input_names.size(); i++) { auto param_name = ov_input_names[i]; auto input_tensor = get_ov_input_tensor(ggml_decoder, param_name); infer_request->set_input_tensor(i, input_tensor); - if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { + if (ggml_openvino_getenv_int("GGML_OPENVINO_DEBUG_INPUT")) { print_input_tensor_info(param_name, input_tensor); } } for (size_t i = 0; i < ov_output_names.size(); i++) { auto * ggml_tensor = ggml_decoder->get_model_outputs().at(ov_output_names[i]); + if (ggml_nbytes(ggml_tensor) == 0) { + continue; + } auto output_tensor = create_ov_output_tensor(ggml_decoder, infer_request, i, ggml_tensor); infer_request->set_output_tensor(i, output_tensor); } + ov_raw_infer_start = ggml_time_us(); infer_request->infer(); infer_end_time = ggml_time_us(); - if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { + if (ggml_openvino_getenv_int("GGML_OPENVINO_DEBUG_OUTPUT")) { for (size_t i = 0; i < ov_output_names.size(); i++) { const auto output_tensor = infer_request->get_output_tensor(i); print_output_tensor_info(ov_output_names[i], output_tensor, output_tensor.data()); } } - if (getenv("GGML_OPENVINO_PROFILING")) { + if (ggml_openvino_getenv_int("GGML_OPENVINO_PROFILING")) { GGML_LOG_INFO("\nGGML OpenVINO Backend: \n"); - GGML_LOG_INFO(" - Graph decoder time: %ld ms \n", (decoder_end_time - start_time) / 1000); + GGML_LOG_INFO(" - Graph decoder time: %.3f ms \n", (decoder_end_time - start_time) / 1000.0); if (!cache_hit) { - GGML_LOG_INFO(" - Graph conversion time: %ld ms \n", (conversion_end_time - decoder_end_time) / 1000); - GGML_LOG_INFO(" - Graph compile time: %ld ms \n", (compile_end_time - conversion_end_time) / 1000); + GGML_LOG_INFO(" - Graph conversion time: %.3f ms \n", + (conversion_end_time - decoder_end_time) / 1000.0); + GGML_LOG_INFO(" - Graph compile time: %.3f ms \n", (compile_end_time - conversion_end_time) / 1000.0); } - GGML_LOG_INFO(" - Graph inference time: %ld ms \n", (infer_end_time - compile_end_time) / 1000); + GGML_LOG_INFO(" - Graph inference time: %.3f ms \n", (infer_end_time - compile_end_time) / 1000.0); + GGML_LOG_INFO(" - OV raw infer time: %.3f ms \n", (infer_end_time - ov_raw_infer_start) / 1000.0); } } @@ -298,17 +397,18 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr 0) { - return atoi(chunk_size_str); - } - return 256; + static const int chunk_size = []() { + int env_prefill_chunk_size = ggml_openvino_getenv_int("GGML_OPENVINO_PREFILL_CHUNK_SIZE"); + return env_prefill_chunk_size > 0 ? env_prefill_chunk_size : 256; + }(); + return chunk_size; }; static std::string device = "NPU"; static auto is_static = true; static auto stateful = false; - static auto prefill_chunk_size = get_prefill_chunk_size(); + + auto prefill_chunk_size = get_prefill_chunk_size(); const auto & config = ggml_openvino_get_compile_config(); if (is_naive(cgraph)) { @@ -326,17 +426,20 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr entry; ModelParams old_m_params; - { + if (cache_enabled) { std::lock_guard map_lock(r_ctx->ctx_mutex); auto it = r_ctx->decoder_cache.find(key); cache_hit = it != r_ctx->decoder_cache.end(); @@ -347,6 +450,10 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr(mutex); r_ctx->decoder_cache[key] = entry; } + } else { + auto mutex = std::make_shared(); + entry = std::make_shared(mutex); + cache_hit = false; } std::lock_guard lock(*(entry->mutex)); @@ -357,6 +464,9 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr ov_input_names_local; + std::vector ov_output_names_local; + if (cache_hit) { std::map> model_weights; ggml_decoder->m_is_prefill = is_prefill; @@ -370,13 +480,15 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr map_lock(r_ctx->ctx_mutex); infer_request = is_prefill ? r_ctx->infer_request_cache_prefill.at(key) : r_ctx->infer_request_cache.at(key); + ov_input_names_local = r_ctx->ov_input_names_cache.at(key); + ov_output_names_local = r_ctx->ov_output_names_cache.at(key); } decoder_end_time = ggml_time_us(); conversion_end_time = decoder_end_time; compile_end_time = decoder_end_time; } else { - { + if (cache_enabled) { std::lock_guard map_lock(r_ctx->ctx_mutex); r_ctx->infer_request_cache.erase(key); r_ctx->infer_request_cache_prefill.erase(key); @@ -385,10 +497,14 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptr model; auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph); - auto ggml_decoder_prefill = std::make_shared(cgraph, m_params, c_params, model_weights, - is_static, stateful, true, prefill_chunk_size); + if (m_params.n_heads_kv == -1) { + // graph is not a LLM, e.g. context-shift graph + prefill_chunk_size = inp_pos->ne[0]; + } + auto ggml_decoder_prefill = std::make_shared( + cgraph, m_params, c_params, model_weights, is_static, stateful, false, true, prefill_chunk_size); auto ggml_decoder_decode = std::make_shared(cgraph, m_params, c_params, model_weights, is_static, - stateful, false, prefill_chunk_size); + stateful, false, false, prefill_chunk_size); decoder_end_time = ggml_time_us(); auto input_model_prefill = std::make_shared(ggml_decoder_prefill); @@ -400,7 +516,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrclear_model_weights(); conversion_end_time = ggml_time_us(); - if (getenv("GGML_OPENVINO_DUMP_IR")) { + if (ggml_openvino_getenv_int("GGML_OPENVINO_DUMP_IR")) { char timestamped_filename[64]; auto timestamp = (long long) ggml_time_us(); snprintf(timestamped_filename, sizeof(timestamped_filename), "model_prefill_%lld.xml", timestamp); @@ -429,32 +545,22 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrptr = ggml_decoder; - std::vector ov_input_names; - std::vector ov_output_names; for (const auto & ov_param : model->get_parameters()) { - ov_input_names.push_back(ov_param->get_friendly_name()); + ov_input_names_local.push_back(ov_param->get_friendly_name()); } for (const auto & ov_output : model->get_results()) { - ov_output_names.push_back(ov_output->get_friendly_name()); + ov_output_names_local.push_back(ov_output->get_friendly_name()); } - { + if (cache_enabled) { std::lock_guard map_lock(r_ctx->ctx_mutex); r_ctx->infer_request_cache_prefill[key] = infer_request_prefill; r_ctx->infer_request_cache[key] = infer_request_decode; - r_ctx->ov_input_names_cache[key] = std::move(ov_input_names); - r_ctx->ov_output_names_cache[key] = std::move(ov_output_names); + r_ctx->ov_input_names_cache[key] = ov_input_names_local; + r_ctx->ov_output_names_cache[key] = ov_output_names_local; } } - std::vector ov_input_names_local; - std::vector ov_output_names_local; - { - std::lock_guard map_lock(r_ctx->ctx_mutex); - ov_input_names_local = r_ctx->ov_input_names_cache[key]; - ov_output_names_local = r_ctx->ov_output_names_cache[key]; - } - if (is_prefill) { auto inp_len = inp_pos->ne[0]; for (int chunk_index = 0; chunk_index * prefill_chunk_size < inp_len; chunk_index++) { @@ -463,7 +569,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrset_input_tensor(i, input_tensor); - if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { + if (ggml_openvino_getenv_int("GGML_OPENVINO_DEBUG_INPUT")) { const auto input_tensor = infer_request->get_input_tensor(i); print_input_tensor_info(param_name, input_tensor); } @@ -475,9 +581,11 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrset_output_tensor(i, output_tensor); } + ov_raw_infer_start = ggml_time_us(); infer_request->infer(); + ov_raw_infer_total += ggml_time_us() - ov_raw_infer_start; - if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { + if (ggml_openvino_getenv_int("GGML_OPENVINO_DEBUG_OUTPUT")) { for (size_t i = 0; i < ov_output_names_local.size(); i++) { const auto output_tensor = infer_request->get_output_tensor(i); print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data()); @@ -491,7 +599,7 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrset_input_tensor(i, input_tensor); - if (getenv("GGML_OPENVINO_DEBUG_INPUT")) { + if (ggml_openvino_getenv_int("GGML_OPENVINO_DEBUG_INPUT")) { const auto input_tensor = infer_request->get_input_tensor(i); print_input_tensor_info(param_name, input_tensor); } @@ -503,10 +611,12 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrset_output_tensor(i, output_tensor); } + ov_raw_infer_start = ggml_time_us(); infer_request->infer(); infer_end_time = ggml_time_us(); + ov_raw_infer_total = infer_end_time - ov_raw_infer_start; - if (getenv("GGML_OPENVINO_DEBUG_OUTPUT")) { + if (ggml_openvino_getenv_int("GGML_OPENVINO_DEBUG_OUTPUT")) { for (size_t i = 0; i < ov_output_names_local.size(); i++) { const auto output_tensor = infer_request->get_output_tensor(i); print_output_tensor_info(ov_output_names_local[i], output_tensor, output_tensor.data()); @@ -514,19 +624,75 @@ enum ggml_status ov_graph_compute_static(ggml_cgraph * cgraph, std::shared_ptrsrc. +// Step 2 verifies that node inputs come from model nodes/weights/leafs; external sources imply split. +bool is_model_splitted(ggml_cgraph * cgraph) { + // check the nodes of the model are used by the following nodes, through compare the node's use count and the count of nodes that use it as input. If does not match, return true, else return false. + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + int use_count = cgraph->use_counts[ggml_hash_find(&cgraph->visited_hash_set, node)]; + // TODO: this is a workround for the tests case from llama.cpp, fix should from the root cause in the future. + if ((cgraph->n_nodes <= 1 && use_count == 0) || + (cgraph->n_nodes <= 1 && node->op == GGML_OP_VIEW && use_count == 1 && node->src[0] != nullptr && + node->src[0]->op == GGML_OP_NONE)) { + return false; + } + if (cgraph->n_nodes == 1 && + (cgraph->nodes[0]->op == GGML_OP_TRANSPOSE || cgraph->nodes[0]->op == GGML_OP_PERMUTE)) { + return false; + } + int input_use_count = 0; + for (int j = 0; j < cgraph->n_nodes; j++) { + ggml_tensor * other_node = cgraph->nodes[j]; + for (int k = 0; k < GGML_MAX_SRC; k++) { + if (other_node->src[k] == node) { + input_use_count++; + } + } + } + if (use_count != input_use_count && node->op != GGML_OP_NONE) { + return true; + } + } + // if all nodes's src node's src is not come from the nodes in the model, we think the model is splitted. This is a complementary check for the above check, because for some special case like the output node is not used by any node, the use count and input use count are both 0, we can not determine whether the model is splitted or not just based on the first check. + auto model_weights = GgmlOvDecoder::create_weight_nodes(cgraph, true); + std::set model_nodes(cgraph->nodes, cgraph->nodes + cgraph->n_nodes); + // leaf nodes + std::set model_leafs(cgraph->leafs, cgraph->leafs + cgraph->n_leafs); + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + for (int j = 0; j < GGML_MAX_SRC; j++) { + ggml_tensor * src = node->src[j]; + // the src is also not the model weights, we think the model is splitted. + // the src is also not in model leafs, we think the model is splitted. + if (src != nullptr && model_nodes.find(src) == model_nodes.end() && + model_weights.find(std::string(src->name)) == model_weights.end() && !model_leafs.empty() == false && + model_leafs.find(src) == model_leafs.end()) { + if (GgmlOvDecoder::is_inp_tok(src, node)) { + return false; + } + return true; + } + } + } + return false; +} + bool is_naive(ggml_cgraph * cgraph) { constexpr int naive_graph_size_threshold = 20; int count = 0; @@ -551,7 +717,7 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph, auto decoder = std::make_shared(cgraph, model_weights); auto input_model = std::make_shared(decoder); auto model = ov::frontend::ggml::FrontEnd::convert(input_model, naive); - if (getenv("GGML_OPENVINO_DUMP_IR")) { + if (ggml_openvino_getenv_int("GGML_OPENVINO_DUMP_IR")) { ov::serialize(model, "IR_naive.xml"); } @@ -578,40 +744,92 @@ enum ggml_status naive_compute(ggml_cgraph * cgraph, infer_request->set_input_tensor(i, input_tensor); } + // Use get_output_tensor + memcpy instead of set_output_tensor to avoid memory overwritten + // when i/o buffer overlaps, e.g. the cgraph is a single PERMUTE + + infer_request->infer(); + auto ov_results = model->get_results(); for (size_t i = 0; i < ov_results.size(); i++) { + auto output_tensor = infer_request->get_output_tensor(i); auto * ggml_tensor = decoder->get_model_outputs().at(ov_results[i]->get_friendly_name()); - auto output_tensor = create_ov_output_tensor(decoder, infer_request, i, ggml_tensor); - infer_request->set_output_tensor(i, output_tensor); + std::memcpy(ggml_tensor->data, output_tensor.data(), output_tensor.get_byte_size()); } - - infer_request->infer(); return GGML_STATUS_SUCCESS; } namespace { +template void set_zero_diagonal(std::vector & matrix, size_t rows, size_t cols, T zero_value = T{}) { + for (size_t i = 0; i < rows; ++i) { + size_t diag_col = std::min(i, cols - 1); + matrix[i * cols + diag_col] = zero_value; + } +} + +ov::Tensor make_contiguous_split_input_tensor(std::shared_ptr ggml_decoder, + const struct ggml_tensor * ggml_tensor, + const ov::Shape & input_shape) { + const size_t element_size = ggml_type_size(ggml_tensor->type); + const size_t block_size = ggml_blck_size(ggml_tensor->type); + + GGML_ASSERT(block_size == 1 && "non-contiguous split inputs must be plain element types"); + + const struct ggml_tensor * source_tensor = ggml_tensor->view_src != nullptr ? ggml_tensor->view_src : ggml_tensor; + const size_t source_offset = ggml_tensor->view_src != nullptr ? ggml_tensor->view_offs : 0; + + std::vector source_data(ggml_nbytes(source_tensor)); + ggml_backend_tensor_get(source_tensor, source_data.data(), 0, source_data.size()); + + ov::Tensor input_tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape); + auto * dst = static_cast(input_tensor.data()); + size_t dst_offset = 0; + + for (size_t i3 = 0; i3 < static_cast(ggml_tensor->ne[3]); ++i3) { + for (size_t i2 = 0; i2 < static_cast(ggml_tensor->ne[2]); ++i2) { + for (size_t i1 = 0; i1 < static_cast(ggml_tensor->ne[1]); ++i1) { + for (size_t i0 = 0; i0 < static_cast(ggml_tensor->ne[0]); ++i0) { + const size_t src_offset = source_offset + i3 * ggml_tensor->nb[3] + i2 * ggml_tensor->nb[2] + + i1 * ggml_tensor->nb[1] + i0 * ggml_tensor->nb[0]; + std::memcpy(dst + dst_offset, source_data.data() + src_offset, element_size); + dst_offset += element_size; + } + } + } + } + + return input_tensor; +} + ov::Tensor convert_ggml_input_to_ov(std::shared_ptr ggml_decoder, const std::string & name) { const auto * ggml_tensor = ggml_decoder->get_input_ggml_tensor(name); - if (ggml_tensor->extra != nullptr) { - // GGML_LOG_DEBUG("Using ggml_tensor->extra as ov::Tensor for input: %s\n", name.c_str()); + if (auto sliced = try_make_kv_sliced_tensor(ggml_decoder, name, ggml_tensor)) { + return *sliced; + } + + if (ggml_tensor->extra != nullptr && !ggml_decoder->is_splited_model()) { auto * extra_base = static_cast(ggml_tensor->extra); - if (extra_base->type != ggml_openvino_extra_base::Type::TENSOR) { - throw std::runtime_error("ggml tensor extra is not of type TENSOR for input: " + name); + if (extra_base->type == ggml_openvino_extra_base::Type::TENSOR) { + // GGML_LOG_DEBUG("Using ggml_tensor->extra as ov::Tensor for input: %s\n", name.c_str()); + auto * tensor_extra = static_cast(extra_base); + return *tensor_extra->tensor; } - auto * tensor_extra = static_cast(extra_base); - return *tensor_extra->tensor; } // GGML_LOG_DEBUG("Converting ggml tensor to ov::Tensor for input: %s\n", name.c_str()); auto * input_data = ggml_tensor->data; ov::Shape input_shape; - if (ggml_tensor->op == GGML_OP_VIEW) { + if (ggml_tensor->op == GGML_OP_VIEW && !ggml_decoder->is_splited_model()) { // This case is added to make test-backend-ops work input_shape = ggml_decoder->get_shape(ggml_tensor->view_src); } else { input_shape = ggml_decoder->get_shape(ggml_tensor); } + + if (ggml_decoder->is_splited_model() && !ggml_is_contiguous(ggml_tensor)) { + return make_contiguous_split_input_tensor(ggml_decoder, ggml_tensor, input_shape); + } + auto input_tensor = ov::Tensor(ggml_decoder->get_ov_type(ggml_tensor), input_shape, input_data); return input_tensor; } @@ -660,6 +878,14 @@ ov::Tensor get_ov_input_tensor_static_decode(std::shared_ptr ggml if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) { size_t context_size = ggml_decoder->get_ctx_size(); + if (ggml_tensor->type == GGML_TYPE_F16) { + std::vector padded_data = + pad_input(ggml_tensor, 1, context_size, GGML_FP32_TO_FP16(-INFINITY)); + ov::Tensor input_tensor(ov::element::f16, ov::Shape{1, 1, 1, context_size}); + std::memcpy(input_tensor.data(), padded_data.data(), padded_data.size() * sizeof(ggml_fp16_t)); + return input_tensor; + } + std::vector padded_data = pad_input(ggml_tensor, 1, context_size, -INFINITY); ov::Tensor input_tensor(ov::element::f32, ov::Shape{1, 1, 1, context_size}); auto * data_ptr = input_tensor.data(); @@ -728,9 +954,20 @@ ov::Tensor get_ov_input_tensor_static_prefill(std::shared_ptr ggm if (GgmlOvDecoder::is_inp_mask(ggml_tensor, op)) { size_t cols = ggml_tensor->ne[0]; size_t rows = ggml_tensor->ne[1]; - float * ggml_data = (float *) ggml_tensor->data + chunk_index * chunk_size * cols; size_t chunk_valid_rows = std::min(chunk_size, rows - chunk_index * chunk_size); size_t context_size = ggml_decoder->get_ctx_size(); + if (ggml_tensor->type == GGML_TYPE_F16) { + const auto * ggml_data = + static_cast(ggml_tensor->data) + chunk_index * chunk_size * cols; + std::vector padded_data = pad_input(ggml_data, chunk_valid_rows, cols, chunk_size, + context_size, GGML_FP32_TO_FP16(-INFINITY)); + set_zero_diagonal(padded_data, chunk_size, context_size, GGML_FP32_TO_FP16(0.0f)); + ov::Tensor input_tensor(ov::element::f16, ov::Shape{1, 1, chunk_size, context_size}); + std::memcpy(input_tensor.data(), padded_data.data(), padded_data.size() * sizeof(ggml_fp16_t)); + return input_tensor; + } + + const auto * ggml_data = static_cast(ggml_tensor->data) + chunk_index * chunk_size * cols; std::vector padded_data = pad_input(ggml_data, chunk_valid_rows, cols, chunk_size, context_size, -INFINITY); set_zero_diagonal(padded_data, chunk_size, context_size); @@ -753,6 +990,65 @@ size_t checksum(const void * data, size_t size) { return sum; } +bool save_ggml_tensor_data_to_txt(const ggml_tensor * tensor, const std::string & file_path) { + if (tensor == nullptr || tensor->data == nullptr) { + return false; + } + + std::ofstream out(file_path); + if (!out.is_open()) { + return false; + } + + const size_t n = ggml_nelements(tensor); + out << "name: " << tensor->name << ", type: " << ggml_type_name(tensor->type) << ", shape: [" << tensor->ne[0] + << ", " << tensor->ne[1] << ", " << tensor->ne[2] << ", " << tensor->ne[3] << "]" << ", elements: " << n + << ", data:" << '\n'; + + switch (tensor->type) { + case GGML_TYPE_F32: { + const auto * data = static_cast(tensor->data); + for (size_t i = 0; i < n; ++i) { + out << data[i] << '\n'; + } + break; + } + case GGML_TYPE_F16: { + const auto * data = static_cast(tensor->data); + for (size_t i = 0; i < n; ++i) { + out << ggml_fp16_to_fp32(data[i]) << '\n'; + } + break; + } + case GGML_TYPE_BF16: { + const auto * data = static_cast(tensor->data); + for (size_t i = 0; i < n; ++i) { + out << ggml_bf16_to_fp32(data[i]) << '\n'; + } + break; + } + case GGML_TYPE_I32: { + const auto * data = static_cast(tensor->data); + for (size_t i = 0; i < n; ++i) { + out << data[i] << '\n'; + } + break; + } + case GGML_TYPE_I64: { + const auto * data = static_cast(tensor->data); + for (size_t i = 0; i < n; ++i) { + out << data[i] << '\n'; + } + break; + } + default: + out << "unsupported tensor type for text dump" << '\n'; + return false; + } + + return true; +} + void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor) { std::cout << "Input name: " << name << ", Input shape: " << tensor.get_shape() << ", Address: " << tensor.data() << std::endl; @@ -849,13 +1145,6 @@ void print_output_tensor_info(const std::string & name, const ov::Tensor & tenso } } -void set_zero_diagonal(std::vector & matrix, size_t rows, size_t cols) { - for (size_t i = 0; i < rows; ++i) { - size_t diag_col = std::min(i, cols - 1); - matrix[i * cols + diag_col] = 0.0f; - } -} - const ggml_tensor * get_inp_pos_tensor(ggml_cgraph * cgraph) { for (int i = 0; i < cgraph->n_nodes; ++i) { auto * op = cgraph->nodes[i]; diff --git a/ggml/src/ggml-openvino/utils.h b/ggml/src/ggml-openvino/utils.h index 2c72e33c352f..c2c7b7cdabdf 100644 --- a/ggml/src/ggml-openvino/utils.h +++ b/ggml/src/ggml-openvino/utils.h @@ -1,4 +1,3 @@ -#include "ggml-backend-impl.h" #include "ggml-decoder.h" #include "ggml-impl.h" @@ -45,6 +44,7 @@ struct graph_key_hash { struct decoder_runtime_ctx { decoder_runtime_ctx(std::shared_ptr mutex) : mutex(std::move(mutex)) {} + std::shared_ptr mutex; std::shared_ptr ptr; }; @@ -64,11 +64,7 @@ struct ov_runtime_context { std::map kv_state_input_name_map; std::atomic backend_count; - ov_runtime_context() : - device("CPU"), - stateful(false), - stateful_kv_size(0), - backend_count(0) {} + ov_runtime_context() : device("CPU"), stateful(false), stateful_kv_size(0), backend_count(0) {} void clear_caches() { std::lock_guard lock(ctx_mutex); @@ -87,6 +83,8 @@ enum ggml_status ov_graph_compute_static(struct ggml_cgraph * cgraph, std::share size_t checksum(const void * data, size_t size); +bool save_ggml_tensor_data_to_txt(const ggml_tensor * tensor, const std::string & file_path); + void print_input_tensor_info(const std::string & name, const ov::Tensor & tensor); void print_output_tensor_info(const std::string & name, const ov::Tensor & tensor, const void * output_dst); @@ -117,8 +115,6 @@ std::vector pad_input(const ggml_tensor * tensor, size_t padded_rows, size_t padded_rows, padded_cols, pad_value); } -void set_zero_diagonal(std::vector & matrix, size_t rows, size_t cols); - const ggml_tensor * get_inp_pos_tensor(struct ggml_cgraph * cgraph); bool get_is_prefill(const ggml_tensor * inp_pos); @@ -137,6 +133,13 @@ ov::Tensor create_ov_output_tensor(std::shared_ptr ggml_decoder, bool is_naive(struct ggml_cgraph * cgraph); +/** + * @brief Heuristically checks whether the given computation graph is a split-model fragment. + * @param cgraph Pointer to the GGML computation graph to analyze. + * @return true if the graph is identified as split; otherwise false. + */ +bool is_model_splitted(struct ggml_cgraph * cgraph); + enum ggml_status naive_compute(struct ggml_cgraph * cgraph, ov::Core & core, const std::string & device, diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index 15d231f70c0d..1ebc50a763f1 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -71,6 +71,44 @@ void quantize_row_q1_0_ref(const float * GGML_RESTRICT x, block_q1_0 * GGML_REST } } +void quantize_row_q2_0_ref(const float * GGML_RESTRICT x, block_q2_0 * GGML_RESTRICT y, int64_t k) { + static const int qk = QK2_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + // Compute scale as max absolute value in the block + float amax = 0.0f; + for (int j = 0; j < qk; j++) { + const float a = fabsf(x[i*qk + j]); + if (a > amax) amax = a; + } + const float d = amax; + const float id = d > 0.0f ? 1.0f / d : 0.0f; + + y[i].d = GGML_FP32_TO_FP16(d); + + // Clear quant bytes + for (int j = 0; j < qk / 4; ++j) { + y[i].qs[j] = 0; + } + + // Encode 2-bit values: round(w/d) clamped to [-1, 2], then add 1 + // 00 (-1) = -scale, 01 (0) = 0, 10 (+1) = +scale, 11 (+2) = 2*scale + for (int j = 0; j < qk; ++j) { + const float w = x[i*qk + j]; + int q = (int)roundf(w * id) + 1; + if (q < 0) q = 0; + if (q > 3) q = 3; + const int byte_index = j / 4; + const int bit_offset = (j % 4) * 2; + y[i].qs[byte_index] |= ((uint8_t)q << bit_offset); + } + } +} + // reference implementation for deterministic creation of model files void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k) { static const int qk = QK4_0; @@ -398,6 +436,26 @@ void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRI } } +void dequantize_row_q2_0(const block_q2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + static const int qk = QK2_0; + + assert(k % qk == 0); + + const int nb = k / qk; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (int j = 0; j < qk; ++j) { + const int byte_index = j / 4; + const int bit_offset = (j % 4) * 2; + const uint8_t q = (x[i].qs[byte_index] >> bit_offset) & 0x03; + // 00=-1, 01=0, 10=+1, 11=+2 + y[i*qk + j] = ((int)q - 1) * d; + } + } +} + void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { static const int qk = QK4_0; @@ -2052,6 +2110,20 @@ size_t quantize_q1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, return nrow * row_size; } +size_t quantize_q2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + if (!quant_weights) { + quantize_row_q2_0_ref(src, dst, (int64_t)nrow*n_per_row); + return nrow * ggml_row_size(GGML_TYPE_Q2_0, n_per_row); + } + size_t row_size = ggml_row_size(GGML_TYPE_Q2_0, n_per_row); + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_q2_0_ref(src, (block_q2_0*)qrow, n_per_row); + src += n_per_row; + qrow += row_size; + } + return nrow * row_size; +} size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { @@ -5461,6 +5533,10 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte { VALIDATE_ROW_DATA_D_F16_IMPL(block_q1_0, data, nb); } break; + case GGML_TYPE_Q2_0: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q2_0, data, nb); + } break; case GGML_TYPE_Q4_0: { VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb); diff --git a/ggml/src/ggml-quants.h b/ggml/src/ggml-quants.h index d56c86da8909..75188f1af180 100644 --- a/ggml/src/ggml-quants.h +++ b/ggml/src/ggml-quants.h @@ -15,6 +15,7 @@ extern "C" { // Quantization GGML_API void quantize_row_q1_0_ref(const float * GGML_RESTRICT x, block_q1_0 * GGML_RESTRICT y, int64_t k); +GGML_API void quantize_row_q2_0_ref(const float * GGML_RESTRICT x, block_q2_0 * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k); GGML_API void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k); @@ -43,6 +44,7 @@ GGML_API void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_ // Dequantization GGML_API void dequantize_row_q1_0(const block_q1_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +GGML_API void dequantize_row_q2_0(const block_q2_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); GGML_API void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); @@ -93,6 +95,7 @@ GGML_API size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTR GGML_API size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q1_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +GGML_API size_t quantize_q2_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); GGML_API size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 180de92202d5..1c17d20df12b 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -39,8 +39,8 @@ if (WIN32) set(CMAKE_CXX_COMPILER "icx") set(CMAKE_CXX_COMPILER_ID "IntelLLVM") endif() - # Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO is enabled) - if(GGML_SYCL_SUPPORT_LEVEL_ZERO) + # Level Zero SDK path for Windows (only when GGML_SYCL_SUPPORT_LEVEL_ZERO_API is enabled) + if(GGML_SYCL_SUPPORT_LEVEL_ZERO_API) if(DEFINED ENV{LEVEL_ZERO_V1_SDK_PATH}) set(LEVEL_ZERO_V1_SDK_PATH $ENV{LEVEL_ZERO_V1_SDK_PATH}) if(EXISTS "${LEVEL_ZERO_V1_SDK_PATH}") @@ -105,8 +105,8 @@ endif() target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing") -message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO ${GGML_SYCL_SUPPORT_LEVEL_ZERO}") -if (GGML_SYCL_SUPPORT_LEVEL_ZERO) +message(STATUS "GGML_SYCL_SUPPORT_LEVEL_ZERO_API ${GGML_SYCL_SUPPORT_LEVEL_ZERO_API}") +if (GGML_SYCL_SUPPORT_LEVEL_ZERO_API) # Link against Level Zero loader for direct device memory allocation. # Avoids sycl::malloc_device triggering DMA-buf/TTM system RAM staging # in the xe kernel driver during multi-GPU inference. @@ -114,7 +114,7 @@ if (GGML_SYCL_SUPPORT_LEVEL_ZERO) find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ${LEVEL_ZERO_V1_SDK_LIB_PATH} ENV LD_LIBRARY_PATH) if(ZE_LOADER_LIB AND LEVEL_ZERO_INCLUDE_DIR) target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB}) - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO) + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_SUPPORT_LEVEL_ZERO_API) message(STATUS "Level Zero loader found: ${ZE_LOADER_LIB}") message(STATUS "Level Zero headers found: ${LEVEL_ZERO_INCLUDE_DIR}") else() diff --git a/ggml/src/ggml-sycl/backend.hpp b/ggml/src/ggml-sycl/backend.hpp index a526d8e58bc9..1f5a91272663 100644 --- a/ggml/src/ggml-sycl/backend.hpp +++ b/ggml/src/ggml-sycl/backend.hpp @@ -17,6 +17,7 @@ #include "common.hpp" #include "concat.hpp" #include "conv.hpp" +#include "conv3d.hpp" #include "convert.hpp" #include "count-equal.hpp" #include "cpy.hpp" diff --git a/ggml/src/ggml-sycl/binbcast.cpp b/ggml/src/ggml-sycl/binbcast.cpp index 92dd18889f4f..306eeddc0c0c 100644 --- a/ggml/src/ggml-sycl/binbcast.cpp +++ b/ggml/src/ggml-sycl/binbcast.cpp @@ -287,6 +287,18 @@ inline void ggml_sycl_op_bin_bcast(ggml_backend_sycl_context & ctx, const ggml_t ne10, ne11, ne12, ne13, ne0, ne1, ne2, ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1), main_stream); +#ifdef GGML_SYCL_HAS_BF16 + } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { + op()((const sycl::ext::oneapi::bfloat16 *) src0->data, (const sycl::ext::oneapi::bfloat16 *) src1->data, + (sycl::ext::oneapi::bfloat16 *) dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, + ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), + ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1), main_stream); + } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_BF16) { + op()((const sycl::ext::oneapi::bfloat16 *) src0->data, (const float *) src1->data, + (sycl::ext::oneapi::bfloat16 *) dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, ne0, ne1, ne2, + ne3, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb0, nb1, nb2, nb3, ggml_is_contiguous(src0), + ggml_is_contiguous(src1), ggml_is_permuted(src0), ggml_is_permuted(src1), main_stream); +#endif } else { fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index ae08abad81bb..e1b6db13eb41 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -12,7 +12,7 @@ #include "common.hpp" #include -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API #include #endif @@ -59,7 +59,7 @@ bool gpu_has_xmx(sycl::device &dev) { return dev.has(sycl::aspect::ext_intel_matrix); } -static int ggml_sycl_get_env(const char *env_name, int default_val) { +int ggml_sycl_get_env(const char *env_name, int default_val) { char *user_device_string = getenv(env_name); int user_number = default_val; @@ -84,9 +84,9 @@ int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block return sycl_down_blk_size; } -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) { - return ggml_sycl_get_env("GGML_SYCL_ENABLE_LEVEL_ZERO", 1) && + return g_ggml_sycl_use_level_zero_api && q.get_device().is_gpu() && q.get_backend() == sycl::backend::ext_oneapi_level_zero; } @@ -94,10 +94,8 @@ static bool ggml_sycl_use_level_zero_device_alloc(sycl::queue &q) { // Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering // DMA-buf/TTM system RAM staging in the xe kernel driver during multi-GPU inference. -// The decision is made from the queue and runtime env because large buffers can be -// allocated before ggml_check_sycl() initializes g_ggml_sycl_enable_level_zero. void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API if (ggml_sycl_use_level_zero_device_alloc(q)) { void *ptr = nullptr; auto ze_ctx = sycl::get_native(q.get_context()); @@ -129,7 +127,7 @@ void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) { void ggml_sycl_free_device(void *ptr, sycl::queue &q) { if (!ptr) return; -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API if (ggml_sycl_use_level_zero_device_alloc(q)) { auto ze_ctx = sycl::get_native(q.get_context()); zeMemFree(ze_ctx, ptr); diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index d8bb3638dfda..8534bd3581e5 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -62,6 +62,7 @@ extern int g_ggml_sycl_debug; extern int g_ggml_sycl_disable_optimize; extern int g_ggml_sycl_prioritize_dmmv; extern int g_ggml_sycl_enable_flash_attention; +extern int g_ggml_sycl_dev2dev_memcpy; #if defined(__clang__) && __has_builtin(__builtin_expect) @@ -126,6 +127,11 @@ enum ggml_sycl_backend_gpu_mode { SYCL_MUL_GPU_MODE }; +enum ggml_sycl_dev2dev_memcpy_mode { + DEV2DEV_MEMCPY_SYCL = 0, + DEV2DEV_MEMCPY_L0 = 1, +}; + static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size"); static void crash() { @@ -225,10 +231,12 @@ struct sycl_device_info { int max_wg_per_cu; // max work groups per compute unit - refer to // cudaOccupancyMaxActiveBlocksPerMultiprocessor bool vmm; // virtual memory support + bool l0_discrete_gpu; // Level Zero backend and not an integrated GPU size_t vmm_granularity; // granularity of virtual memory size_t total_vram; sycl_hw_info hw_info; optimize_feature opt_feature; + bool usm_system_support; // support for USM system allocations }; @@ -316,12 +324,17 @@ struct ggml_tensor_extra_gpu { optimize_feature optimized_feature; }; -extern int g_ggml_sycl_enable_level_zero; +extern int g_ggml_sycl_use_level_zero_api; void * ggml_sycl_malloc_device(size_t size, sycl::queue &q); void ggml_sycl_free_device(void *ptr, sycl::queue &q); void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector streams={}); +struct mmid_row_mapping { + int32_t i1; + int32_t i2; +}; + namespace sycl_ex = sycl::ext::oneapi::experimental; struct ggml_backend_sycl_context { int device; @@ -419,6 +432,8 @@ struct ggml_backend_sycl_context { std::unique_ptr host_pools[GGML_SYCL_MAX_DEVICES]; + std::vector mmid_row_mapping_host; + static std::unique_ptr new_pool_for_device(queue_ptr qptr, int device); static std::unique_ptr new_pool_for_host(queue_ptr qptr, int device); @@ -644,6 +659,8 @@ constexpr size_t ceil_div(const size_t m, const size_t n) { bool gpu_has_xmx(sycl::device &dev); +int ggml_sycl_get_env(const char *env_name, int default_val); + template std::string debug_get_array_str(const std::string & prefix, const T array[N]) { if (LIKELY(!g_ggml_sycl_debug)) { return ""; diff --git a/ggml/src/ggml-sycl/concat.cpp b/ggml/src/ggml-sycl/concat.cpp index d16215bc91cc..93e00d65fcd3 100644 --- a/ggml/src/ggml-sycl/concat.cpp +++ b/ggml/src/ggml-sycl/concat.cpp @@ -10,6 +10,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // +#include "ggml.h" + #include "concat.hpp" static inline size_t elem_size(ggml_type t) { @@ -192,11 +194,29 @@ void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { case GGML_TYPE_F32: concat_impl_sycl(ctx, dst); break; + case GGML_TYPE_F16: + concat_impl_sycl(ctx, dst); + break; +#ifdef GGML_SYCL_HAS_BF16 + case GGML_TYPE_BF16: + concat_impl_sycl(ctx, dst); + break; +#endif case GGML_TYPE_I32: concat_impl_sycl(ctx, dst); break; + case GGML_TYPE_I16: + concat_impl_sycl(ctx, dst); + break; + case GGML_TYPE_I64: + concat_impl_sycl(ctx, dst); + break; + case GGML_TYPE_I8: + concat_impl_sycl(ctx, dst); + break; default: - GGML_ASSERT(false && "ggml_sycl_op_concat: unsupported type"); + fprintf(stderr, "%s: unsupported types: dst: %s\n", __func__, ggml_type_name(dst->type)); + GGML_ASSERT(false); break; } } diff --git a/ggml/src/ggml-sycl/conv2d-dw.cpp b/ggml/src/ggml-sycl/conv2d-dw.cpp new file mode 100644 index 000000000000..0a52b79174c4 --- /dev/null +++ b/ggml/src/ggml-sycl/conv2d-dw.cpp @@ -0,0 +1,158 @@ +#include "conv2d-dw.hpp" + +struct conv2d_dw_params { + int in_w, in_h; + int out_w, out_h; + int kernel_w, kernel_h; + int stride_x, stride_y; + int padding_x, padding_y; + int dilation_x, dilation_y; + int channels, batches; +}; + +struct conv2d_dw_kernel_bounds { + int y_min, y_max; + int x_min, x_max; +}; + +static inline conv2d_dw_kernel_bounds dw_calculate_kernel_bounds(int out_x, int out_y, + const conv2d_dw_params & p) { + conv2d_dw_kernel_bounds bounds; + bounds.y_min = sycl::max(0, (p.padding_y - out_y * p.stride_y + p.dilation_y - 1) / p.dilation_y); + bounds.y_max = sycl::min(p.kernel_h, + (p.in_h + p.padding_y - out_y * p.stride_y + p.dilation_y - 1) / p.dilation_y); + bounds.x_min = sycl::max(0, (p.padding_x - out_x * p.stride_x + p.dilation_x - 1) / p.dilation_x); + bounds.x_max = sycl::min(p.kernel_w, + (p.in_w + p.padding_x - out_x * p.stride_x + p.dilation_x - 1) / p.dilation_x); + return bounds; +} + +static inline int dw_calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) { + return out_coord * stride + kern_coord * dilation - padding; +} + +// whcn layout: input/output stored as [N, C, H, W] +struct dw_whcn_layout { + static int input_index(int n, int c, int y, int x, const conv2d_dw_params & p) { + return n * (p.channels * p.in_w * p.in_h) + c * p.in_w * p.in_h + y * p.in_w + x; + } + static int kernel_index(int c, int ky, int kx, const conv2d_dw_params & p) { + return c * p.kernel_h * p.kernel_w + ky * p.kernel_w + kx; + } + static int output_index(int n, int c, int y, int x, const conv2d_dw_params & p) { + return n * (p.channels * p.out_w * p.out_h) + c * p.out_w * p.out_h + y * p.out_w + x; + } + static void unpack_indices(int global_idx, const conv2d_dw_params & p, + int & n, int & c, int & out_y, int & out_x) { + out_x = global_idx % p.out_w; + out_y = (global_idx / p.out_w) % p.out_h; + c = (global_idx / (p.out_w * p.out_h)) % p.channels; + n = global_idx / (p.out_w * p.out_h * p.channels); + } +}; + +// cwhn layout: input/output stored as [N, H, W, C] +struct dw_cwhn_layout { + static int input_index(int n, int c, int y, int x, const conv2d_dw_params & p) { + return n * (p.channels * p.in_w * p.in_h) + (y * p.in_w + x) * p.channels + c; + } + static int kernel_index(int c, int ky, int kx, const conv2d_dw_params & p) { + return (ky * p.kernel_w + kx) * p.channels + c; + } + static int output_index(int n, int c, int y, int x, const conv2d_dw_params & p) { + return n * (p.channels * p.out_w * p.out_h) + y * (p.out_w * p.channels) + x * p.channels + c; + } + static void unpack_indices(int global_idx, const conv2d_dw_params & p, + int & n, int & c, int & out_y, int & out_x) { + c = global_idx % p.channels; + out_x = (global_idx / p.channels) % p.out_w; + out_y = (global_idx / (p.channels * p.out_w)) % p.out_h; + n = global_idx / (p.channels * p.out_w * p.out_h); + } +}; + +template +static void conv2d_dw_kernel(const float * input, const float * kernel, float * output, + const conv2d_dw_params p, const sycl::nd_item<3> & item_ct1) { + const int global_idx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + const int total_elements = p.batches * p.channels * p.out_h * p.out_w; + + if (global_idx >= total_elements) { + return; + } + + int n, c, out_y, out_x; + Layout::unpack_indices(global_idx, p, n, c, out_y, out_x); + + float acc = 0.0f; + const conv2d_dw_kernel_bounds bounds = dw_calculate_kernel_bounds(out_x, out_y, p); + + for (int ky = bounds.y_min; ky < bounds.y_max; ++ky) { + const int in_y = dw_calculate_input_coord(out_y, ky, p.stride_y, p.dilation_y, p.padding_y); + for (int kx = bounds.x_min; kx < bounds.x_max; ++kx) { + const int in_x = dw_calculate_input_coord(out_x, kx, p.stride_x, p.dilation_x, p.padding_x); + acc += input[Layout::input_index(n, c, in_y, in_x, p)] * + kernel[Layout::kernel_index(c, ky, kx, p)]; + } + } + + output[Layout::output_index(n, c, out_y, out_x, p)] = acc; +} + +template +static void conv2d_dw_sycl(const float * x_d, const float * w_d, float * y_d, + const conv2d_dw_params p, const queue_ptr & stream) { + const int total = p.batches * p.channels * p.out_h * p.out_w; + const int num_blocks = (total + SYCL_CONV2D_DW_BLOCK_SIZE - 1) / SYCL_CONV2D_DW_BLOCK_SIZE; + const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_DW_BLOCK_SIZE); + const sycl::range<3> block_nums(1, 1, num_blocks); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + conv2d_dw_kernel(x_d, w_d, y_d, p, item_ct1); + }); +} + +void ggml_sycl_op_conv2d_dw(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + + const ggml_tensor * kernel = dst->src[0]; + const ggml_tensor * input = dst->src[1]; + + GGML_ASSERT(kernel->type == GGML_TYPE_F32 && input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + + const float * w_d = (const float *) kernel->data; + const float * x_d = (const float *) input->data; + float * y_d = (float *) dst->data; + + const int32_t * p = (const int32_t *) dst->op_params; + const int stride_x = p[0]; + const int stride_y = p[1]; + const int padding_x = p[2]; + const int padding_y = p[3]; + const int dilation_x = p[4]; + const int dilation_y = p[5]; + + const int in_w = input->ne[0]; + const int in_h = input->ne[1]; + const int kernel_w = kernel->ne[0]; + const int kernel_h = kernel->ne[1]; + const int out_w = dst->ne[0]; + const int out_h = dst->ne[1]; + const int channels = dst->ne[2]; + const int batches = dst->ne[3]; + + const conv2d_dw_params params = { in_w, in_h, out_w, out_h, kernel_w, kernel_h, + stride_x, stride_y, padding_x, padding_y, + dilation_x, dilation_y, channels, batches }; + + const queue_ptr stream = ctx.stream(); + + if (ggml_is_contiguous(input)) { + conv2d_dw_sycl(x_d, w_d, y_d, params, stream); + } else if (ggml_is_contiguous_channels(input)) { + conv2d_dw_sycl(x_d, w_d, y_d, params, stream); + } else { + GGML_ABORT("Unsupported memory layout for conv2d_dw"); + } +} diff --git a/ggml/src/ggml-sycl/conv2d-dw.hpp b/ggml/src/ggml-sycl/conv2d-dw.hpp new file mode 100644 index 000000000000..532892221930 --- /dev/null +++ b/ggml/src/ggml-sycl/conv2d-dw.hpp @@ -0,0 +1,10 @@ +#ifndef GGML_SYCL_CONV2D_DW_HPP +#define GGML_SYCL_CONV2D_DW_HPP + +#include "common.hpp" + +#define SYCL_CONV2D_DW_BLOCK_SIZE 256 + +void ggml_sycl_op_conv2d_dw(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_CONV2D_DW_HPP diff --git a/ggml/src/ggml-sycl/conv2d-transpose.cpp b/ggml/src/ggml-sycl/conv2d-transpose.cpp new file mode 100644 index 000000000000..07c325cc6706 --- /dev/null +++ b/ggml/src/ggml-sycl/conv2d-transpose.cpp @@ -0,0 +1,125 @@ +#include "conv2d-transpose.hpp" +#include "convert.hpp" + +template +static void conv2d_transpose_kernel(const float * input, const kernel_t * kernel, float * output, + const int in_w, const int in_h, + const int out_w, const int out_h, + const int kernel_w, const int kernel_h, + const int stride, + const int c_in, const int c_out, const int batches, + const sycl::nd_item<3> & item_ct1) { + const int global_idx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + const int total_elements = out_w * out_h * c_out * batches; + + if (global_idx >= total_elements) { + return; + } + + const int out_x = global_idx % out_w; + const int out_y = (global_idx / out_w) % out_h; + const int c_idx = (global_idx / (out_w * out_h)) % c_out; + const int n_idx = global_idx / (out_w * out_h * c_out); + + float acc = 0.0f; + + for (int c_in_idx = 0; c_in_idx < c_in; ++c_in_idx) { + for (int kh = 0; kh < kernel_h; ++kh) { + int in_y = out_y - kh; + if (in_y < 0 || in_y % stride) { + continue; + } + in_y /= stride; + if (in_y >= in_h) { + continue; + } + + for (int kw = 0; kw < kernel_w; ++kw) { + int in_x = out_x - kw; + if (in_x < 0 || in_x % stride) { + continue; + } + in_x /= stride; + if (in_x >= in_w) { + continue; + } + + const int input_idx = (in_w * in_h * c_in) * n_idx + (in_w * in_h) * c_in_idx + in_w * in_y + in_x; + const int kernel_idx = (kernel_h * kernel_w * c_out) * c_in_idx + (kernel_h * kernel_w) * c_idx + + kernel_w * kh + kw; + + acc += input[input_idx] * ggml_sycl_cast(kernel[kernel_idx]); + } + } + } + + output[(out_w * out_h * c_out) * n_idx + (out_w * out_h) * c_idx + out_w * out_y + out_x] = acc; +} + +template +static void conv2d_transpose_sycl(const float * input_d, const kernel_t * kernel_d, float * output_d, + const int in_w, const int in_h, + const int out_w, const int out_h, + const int kernel_w, const int kernel_h, + const int stride, + const int c_in, const int c_out, const int batches, + const queue_ptr & stream) { + const int total = out_w * out_h * c_out * batches; + const int num_blocks = (total + SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE - 1) / SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE; + const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE); + const sycl::range<3> block_nums(1, 1, num_blocks); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + conv2d_transpose_kernel(input_d, kernel_d, output_d, + in_w, in_h, out_w, out_h, kernel_w, kernel_h, + stride, c_in, c_out, batches, item_ct1); + }); +} + +// input: (W, H, C_in, N) +// kernel: (W, H, C_out, C_in) +// output: (W, H, C_out, N) +void ggml_sycl_op_conv2d_transpose(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + + const ggml_tensor * kernel = dst->src[0]; + const ggml_tensor * input = dst->src[1]; + + GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32); + GGML_ASSERT(input->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32); + + GGML_ASSERT(ggml_is_contiguous(input)); + GGML_ASSERT(ggml_is_contiguous(kernel)); + GGML_ASSERT(ggml_is_contiguous(dst)); + + const float * input_d = (const float *) input->data; + float * output_d = (float *) dst->data; + const void * kernel_d = kernel->data; + + const int input_w = input->ne[0]; + const int input_h = input->ne[1]; + const int channels_in = input->ne[2]; + const int batches = input->ne[3]; + const int output_w = dst->ne[0]; + const int output_h = dst->ne[1]; + const int channels_out = kernel->ne[2]; + const int kernel_w = kernel->ne[0]; + const int kernel_h = kernel->ne[1]; + const int stride = dst->op_params[0]; + + GGML_ASSERT(channels_in == kernel->ne[3]); + GGML_ASSERT(stride > 0); + + const queue_ptr stream = ctx.stream(); + + if (kernel->type == GGML_TYPE_F16) { + conv2d_transpose_sycl(input_d, (const sycl::half *) kernel_d, output_d, + input_w, input_h, output_w, output_h, kernel_w, kernel_h, + stride, channels_in, channels_out, batches, stream); + } else { + conv2d_transpose_sycl(input_d, (const float *) kernel_d, output_d, + input_w, input_h, output_w, output_h, kernel_w, kernel_h, + stride, channels_in, channels_out, batches, stream); + } +} diff --git a/ggml/src/ggml-sycl/conv2d-transpose.hpp b/ggml/src/ggml-sycl/conv2d-transpose.hpp new file mode 100644 index 000000000000..ca067318ddee --- /dev/null +++ b/ggml/src/ggml-sycl/conv2d-transpose.hpp @@ -0,0 +1,10 @@ +#ifndef GGML_SYCL_CONV2D_TRANSPOSE_HPP +#define GGML_SYCL_CONV2D_TRANSPOSE_HPP + +#include "common.hpp" + +#define SYCL_CONV2D_TRANSPOSE_BLOCK_SIZE 256 + +void ggml_sycl_op_conv2d_transpose(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_CONV2D_TRANSPOSE_HPP diff --git a/ggml/src/ggml-sycl/conv2d.cpp b/ggml/src/ggml-sycl/conv2d.cpp new file mode 100644 index 000000000000..3b3b49d05d3c --- /dev/null +++ b/ggml/src/ggml-sycl/conv2d.cpp @@ -0,0 +1,150 @@ +#include "conv2d.hpp" +#include "convert.hpp" + +struct conv2d_params { + const int64_t IW, IH; + const int64_t OW, OH; + const int64_t KW, KH; + const int64_t ST_X, ST_Y; + const int64_t PD_X, PD_Y; + const int64_t DL_X, DL_Y; + const int64_t IC, OC; + const int64_t B; + const int64_t TOTAL; +}; + +struct conv2d_kernel_bounds { + int64_t y_min, y_max; + int64_t x_min, x_max; +}; + +static inline int64_t conv2d_max64(int64_t a, int64_t b) { + return (a > b) ? a : b; +} + +static inline int64_t conv2d_min64(int64_t a, int64_t b) { + return (a < b) ? a : b; +} + +static inline conv2d_kernel_bounds calculate_kernel_bounds(int64_t out_x, int64_t out_y, const conv2d_params & P) { + conv2d_kernel_bounds bounds; + bounds.y_min = conv2d_max64(0, (P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); + bounds.y_max = conv2d_min64(P.KH, (P.IH + P.PD_Y - out_y * P.ST_Y + P.DL_Y - 1) / P.DL_Y); + bounds.x_min = conv2d_max64(0, (P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + bounds.x_max = conv2d_min64(P.KW, (P.IW + P.PD_X - out_x * P.ST_X + P.DL_X - 1) / P.DL_X); + return bounds; +} + +static inline int calculate_input_coord(int64_t out_coord, int64_t kern_coord, int64_t stride, + int64_t dilation, int64_t padding) { + return out_coord * stride + kern_coord * dilation - padding; +} + +// whcn layout helpers (matching ggml tensor memory order) +static inline int64_t whcn_input_index(int64_t n, int64_t c, int64_t y, int64_t x, const conv2d_params & P) { + return n * (P.IC * P.IW * P.IH) + c * P.IW * P.IH + y * P.IW + x; +} + +static inline int64_t whcn_kernel_index(int64_t c_out, int64_t c_in, int64_t ky, int64_t kx, const conv2d_params & P) { + return c_out * (P.IC * P.KH * P.KW) + c_in * (P.KH * P.KW) + ky * P.KW + kx; +} + +static inline int64_t whcn_output_index(int64_t n, int64_t c, int64_t y, int64_t x, const conv2d_params & P) { + return n * (P.OC * P.OW * P.OH) + c * P.OW * P.OH + y * P.OW + x; +} + +template +static void conv2d_kernel(const float * input, const T * kernel, float * output, + const conv2d_params P, const sycl::nd_item<3> & item_ct1) { + const int64_t global_idx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + + if (global_idx >= P.TOTAL) { + return; + } + + const int64_t out_x = global_idx % P.OW; + const int64_t out_y = (global_idx / P.OW) % P.OH; + const int64_t c_out = (global_idx / (P.OW * P.OH)) % P.OC; + const int64_t n = global_idx / (P.OW * P.OH * P.OC); + + float acc = 0.0f; + + const conv2d_kernel_bounds bounds = calculate_kernel_bounds(out_x, out_y, P); + + for (int64_t c_in = 0; c_in < P.IC; ++c_in) { + for (int64_t ky = bounds.y_min; ky < bounds.y_max; ++ky) { + const int64_t in_y = calculate_input_coord(out_y, ky, P.ST_Y, P.DL_Y, P.PD_Y); + for (int64_t kx = bounds.x_min; kx < bounds.x_max; ++kx) { + const int64_t in_x = calculate_input_coord(out_x, kx, P.ST_X, P.DL_X, P.PD_X); + const float input_val = input[whcn_input_index(n, c_in, in_y, in_x, P)]; + const T kernel_val = kernel[whcn_kernel_index(c_out, c_in, ky, kx, P)]; + acc += input_val * ggml_sycl_cast(kernel_val); + } + } + } + + output[whcn_output_index(n, c_out, out_y, out_x, P)] = acc; +} + +template +static void conv2d_sycl(const float * X_D, const T * K_D, float * Y_D, + const conv2d_params P, const queue_ptr & stream) { + const int num_blocks = (P.TOTAL + SYCL_CONV2D_BLOCK_SIZE - 1) / SYCL_CONV2D_BLOCK_SIZE; + const sycl::range<3> block_dims(1, 1, SYCL_CONV2D_BLOCK_SIZE); + const sycl::range<3> block_nums(1, 1, num_blocks); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + conv2d_kernel(X_D, K_D, Y_D, P, item_ct1); + }); +} + +void ggml_sycl_op_conv2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + + const ggml_tensor * kernel = dst->src[0]; + const ggml_tensor * input = dst->src[1]; + const float * K_D = (const float *) kernel->data; + const float * X_D = (const float *) input->data; + float * Y_D = (float *) dst->data; + + GGML_ASSERT(ggml_is_contiguous(kernel)); + GGML_ASSERT(kernel->type == GGML_TYPE_F16 || kernel->type == GGML_TYPE_F32); + GGML_ASSERT(input->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + // same number of input channels + GGML_ASSERT(input->ne[2] == kernel->ne[2]); + + const queue_ptr stream = ctx.stream(); + + const int32_t * p = (const int32_t *) dst->op_params; + const int ST_X = p[0]; + const int ST_Y = p[1]; + const int PD_X = p[2]; + const int PD_Y = p[3]; + const int DL_X = p[4]; + const int DL_Y = p[5]; + + // no cwhn layout support + GGML_ASSERT(p[6] == 0); + + const int IW = input->ne[0]; + const int IH = input->ne[1]; + const int OW = dst->ne[0]; + const int OH = dst->ne[1]; + const int KW = kernel->ne[0]; + const int KH = kernel->ne[1]; + const int IC = input->ne[2]; + const int OC = kernel->ne[3]; + const int B = input->ne[3]; + + const int64_t total = (int64_t) B * OC * OH * OW; + const conv2d_params params = { IW, IH, OW, OH, KW, KH, ST_X, ST_Y, PD_X, PD_Y, DL_X, DL_Y, IC, OC, B, total }; + + if (kernel->type == GGML_TYPE_F16) { + conv2d_sycl(X_D, (const sycl::half *) K_D, Y_D, params, stream); + } else { + conv2d_sycl(X_D, K_D, Y_D, params, stream); + } +} diff --git a/ggml/src/ggml-sycl/conv2d.hpp b/ggml/src/ggml-sycl/conv2d.hpp new file mode 100644 index 000000000000..efd25ab42a41 --- /dev/null +++ b/ggml/src/ggml-sycl/conv2d.hpp @@ -0,0 +1,10 @@ +#ifndef GGML_SYCL_CONV2D_HPP +#define GGML_SYCL_CONV2D_HPP + +#include "common.hpp" + +#define SYCL_CONV2D_BLOCK_SIZE 256 + +void ggml_sycl_op_conv2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_CONV2D_HPP diff --git a/ggml/src/ggml-sycl/conv3d.cpp b/ggml/src/ggml-sycl/conv3d.cpp new file mode 100644 index 000000000000..3796562553c0 --- /dev/null +++ b/ggml/src/ggml-sycl/conv3d.cpp @@ -0,0 +1,224 @@ +#include "conv3d.hpp" + +static inline int64_t ggml_sycl_conv3d_calc_patch_total(const ggml_tensor * dst, int32_t n) { + return (int64_t) n * dst->ne[0] * dst->ne[1] * dst->ne[2]; +} + +static inline int64_t ggml_sycl_conv3d_calc_knl_n_total(const ggml_tensor * src0, int32_t c) { + return (int64_t) src0->ne[0] * src0->ne[1] * src0->ne[2] * c; +} + +static inline void ggml_sycl_conv3d_write_output( + const ggml_tensor * dst, + const float * src, float * dst_data, + int64_t patch_total, int64_t oc, + int64_t dst_w, int64_t dst_h, int64_t dst_d, + dpct::queue_ptr stream) { + const int64_t dst_nb0 = dst->nb[0]; + const int64_t dst_nb1 = dst->nb[1]; + const int64_t dst_nb2 = dst->nb[2]; + const int64_t dst_nb3 = dst->nb[3]; + const int64_t total = patch_total * oc; + const int64_t block_size = 256; + const int64_t num_work_items = ((total + block_size - 1) / block_size) * block_size; + + stream->parallel_for(sycl::range<1>(num_work_items), [=](sycl::id<1> id) { + const int64_t i = id[0]; + if (i >= total) { + return; + } + + const int64_t patch_idx = i / oc; + const int64_t out_ch = i % oc; + const int64_t p_in_batch = patch_idx % (dst_w * dst_h * dst_d); + const int64_t batch_idx = patch_idx / (dst_w * dst_h * dst_d); + const int64_t dst_z = p_in_batch / (dst_w * dst_h); + const int64_t dst_y = (p_in_batch % (dst_w * dst_h)) / dst_w; + const int64_t dst_x = p_in_batch % dst_w; + const int64_t ocn_idx = batch_idx * oc + out_ch; + + const int64_t dst_offset = dst_x * dst_nb0 + dst_y * dst_nb1 + dst_z * dst_nb2 + ocn_idx * dst_nb3; + // `src` is a column-major (m x n) GEMM output where m == patch_total, n == oc. + // GEMM stores element (row, col) at index `row + col*m`, so compute index accordingly. + const int64_t src_index = patch_idx + out_ch * patch_total; + const float value = src[src_index]; + *(float *)((char *)dst_data + dst_offset) = value; + }); +} + +void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + GGML_ASSERT(ggml_is_contiguous(src1)); + + const int32_t * opts = (const int32_t *) dst->op_params; + const int32_t s0 = opts[0]; + const int32_t s1 = opts[1]; + const int32_t s2 = opts[2]; + const int32_t p0 = opts[3]; + const int32_t p1 = opts[4]; + const int32_t p2 = opts[5]; + const int32_t d0 = opts[6]; + const int32_t d1 = opts[7]; + const int32_t d2 = opts[8]; + const int32_t c = opts[9]; + const int32_t n = opts[10]; + const int32_t oc = opts[11]; + + const int64_t knl_w = src0->ne[0]; + const int64_t knl_h = src0->ne[1]; + const int64_t knl_d = src0->ne[2]; + + const int64_t patch_total = ggml_sycl_conv3d_calc_patch_total(dst, n); + const int64_t knl_n_total = ggml_sycl_conv3d_calc_knl_n_total(src0, c); + + const size_t kernel_type_size = ggml_element_size(src0); + + ggml_sycl_pool_alloc gemm_output(ctx.pool()); + gemm_output.alloc((size_t) patch_total * oc); + + ggml_tensor dst_mat = {}; + dst_mat.type = GGML_TYPE_F32; + dst_mat.ne[0] = patch_total; + dst_mat.ne[1] = oc; + dst_mat.ne[2] = 1; + dst_mat.ne[3] = 1; + dst_mat.nb[0] = sizeof(float); + dst_mat.nb[1] = dst_mat.nb[0] * dst_mat.ne[0]; + dst_mat.nb[2] = dst_mat.nb[1]; + dst_mat.nb[3] = dst_mat.nb[2]; + dst_mat.data = gemm_output.get(); + dst_mat.buffer = dst->buffer; + dst_mat.extra = dst->extra; + + dpct::queue_ptr stream = ctx.stream(); + + // allocate packed arrays: A_packed (k x m), B_packed (k x n) + ggml_sycl_pool_alloc A_packed_alloc(ctx.pool()); + ggml_sycl_pool_alloc B_packed_alloc(ctx.pool()); + A_packed_alloc.alloc((size_t) knl_n_total * patch_total); + B_packed_alloc.alloc((size_t) knl_n_total * oc); + + float * A_packed = A_packed_alloc.get(); + float * B_packed = B_packed_alloc.get(); + + const int m = (int) patch_total; + const int n_gemm = (int) oc; + const int k = (int) knl_n_total; + + // Combined kernel: im2col -> pack A, and pack B simultaneously + const char * src1_base = (const char *) src1->data; + const char * src0_base = (const char *) src0->data; + const int64_t src1_nb0 = src1->nb[0]; + const int64_t src1_nb1 = src1->nb[1]; + const int64_t src1_nb2 = src1->nb[2]; + const int64_t src1_nb3 = src1->nb[3]; + const int64_t src1_w = src1->ne[0]; + const int64_t src1_h = src1->ne[1]; + const int64_t src1_d = src1->ne[2]; + + const bool src0_is_f32 = (src0->type == GGML_TYPE_F32); + + // Compute correct strides for src0 as (knl_n_total, oc) matrix + const int64_t src0_packed_nb0 = kernel_type_size; + const int64_t src0_packed_nb1 = kernel_type_size * knl_n_total; + + const int64_t KW = knl_w; + const int64_t KH = knl_h; + const int64_t KD = knl_d; + const int64_t PW = dst->ne[0]; + const int64_t PH = dst->ne[1]; + const int64_t PD = dst->ne[2]; + + // Pack A (with inline im2col): for each (row, col) in k x m matrix + const int64_t A_total = (int64_t)k * m; + const int64_t A_block_size = 256; + const int64_t A_num_work = ((A_total + A_block_size - 1) / A_block_size) * A_block_size; + + stream->parallel_for(sycl::range<1>(A_num_work), [=](sycl::id<1> id) { + const int64_t t = id[0]; + if (t >= A_total) return; + + const int64_t row = t % k; + const int64_t col = t / k; + + // Inline im2col for this element + const int64_t k_index = row; + const int64_t patch_idx = col; + + const int64_t ic = k_index / (KD * KH * KW); + const int64_t rem = k_index - ic * (KD * KH * KW); + const int64_t kz = rem / (KH * KW); + const int64_t rem2 = rem - kz * (KH * KW); + const int64_t ky = rem2 / KW; + const int64_t kx = rem2 % KW; + + const int64_t p_in_batch = patch_idx % (PW * PH * PD); + const int64_t batch_idx = patch_idx / (PW * PH * PD); + const int64_t dst_z = p_in_batch / (PW * PH); + const int64_t dst_y = (p_in_batch % (PW * PH)) / PW; + const int64_t dst_x = p_in_batch % PW; + + const int64_t sx = dst_x * s0 + kx * d0 - p0; + const int64_t sy = dst_y * s1 + ky * d1 - p1; + const int64_t sz = dst_z * s2 + kz * d2 - p2; + + float val = 0.0f; + if (sx >= 0 && sx < src1_w && sy >= 0 && sy < src1_h && sz >= 0 && sz < src1_d) { + const int64_t channel_idx = batch_idx * c + ic; + const char * ptr = src1_base + sx * src1_nb0 + sy * src1_nb1 + sz * src1_nb2 + channel_idx * src1_nb3; + val = *(const float *) ptr; + } + A_packed[row + col * (int64_t)k] = val; + }); + + // Pack B: for each (row, col) in k x n_gemm matrix + const int64_t B_total = (int64_t)k * n_gemm; + const int64_t B_block_size = 256; + const int64_t B_num_work = ((B_total + B_block_size - 1) / B_block_size) * B_block_size; + + stream->parallel_for(sycl::range<1>(B_num_work), [=](sycl::id<1> id) { + const int64_t t = id[0]; + if (t >= B_total) return; + + const int64_t row = t % k; + const int64_t col = t / k; + const char * src_ptr = src0_base + row * src0_packed_nb0 + col * src0_packed_nb1; + float v; + if (src0_is_f32) { + v = *(const float *) src_ptr; + } else { + v = sycl::vec(*(const sycl::half *) src_ptr).convert()[0]; + } + B_packed[row + col * (int64_t)k] = v; + }); + + // GEMM: C = A^T * B where A is (k x m), B is (k x n), C is (m x n) + const float alpha = 1.0f; + const float beta = 0.0f; + const int lda = k; + const int ldb = k; + const int ldc = m; + + SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( + *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, + m, n_gemm, k, + dpct::get_value(&alpha, *stream), + (const float *) A_packed, lda, + (const float *) B_packed, ldb, + dpct::get_value(&beta, *stream), + (float *) dst_mat.data, ldc))); + + const float * gemm_data = (const float *) dst_mat.data; + float * dst_data = (float *) dst->data; + + ggml_sycl_conv3d_write_output(dst, gemm_data, dst_data, patch_total, oc, + dst->ne[0], dst->ne[1], dst->ne[2], stream); +} diff --git a/ggml/src/ggml-sycl/conv3d.hpp b/ggml/src/ggml-sycl/conv3d.hpp new file mode 100644 index 000000000000..5852f393f1b2 --- /dev/null +++ b/ggml/src/ggml-sycl/conv3d.hpp @@ -0,0 +1,8 @@ +#ifndef GGML_SYCL_CONV3D_HPP +#define GGML_SYCL_CONV3D_HPP + +#include "common.hpp" + +void ggml_sycl_op_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_CONV3D_HPP diff --git a/ggml/src/ggml-sycl/convert.cpp b/ggml/src/ggml-sycl/convert.cpp index 65593402e7de..060d0aca2db4 100644 --- a/ggml/src/ggml-sycl/convert.cpp +++ b/ggml/src/ggml-sycl/convert.cpp @@ -642,6 +642,8 @@ static void convert_unary_sycl(const void * vx, dst_t * y, const int64_t k, dpct to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) { switch (type) { + case GGML_TYPE_Q1_0: + return dequantize_block_sycl; case GGML_TYPE_Q4_0: if (dst->src[0]->extra && ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { @@ -724,6 +726,8 @@ to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type, ggml_tensor * dst) { to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type, ggml_tensor *dst) { switch (type) { + case GGML_TYPE_Q1_0: + return dequantize_block_sycl; case GGML_TYPE_Q4_0: if (dst->src[0]->extra && ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { @@ -830,6 +834,8 @@ to_fp16_nc_sycl_t ggml_get_to_fp16_nc_sycl(ggml_type type) { case GGML_TYPE_BF16: return convert_unary_nc_sycl; #endif + case GGML_TYPE_Q1_0: + return dequantize_block_nc_sycl; case GGML_TYPE_Q4_0: return dequantize_block_nc_sycl; case GGML_TYPE_Q4_1: diff --git a/ggml/src/ggml-sycl/cpy.hpp b/ggml/src/ggml-sycl/cpy.hpp index 3c331f1ef27b..62ff34c87933 100644 --- a/ggml/src/ggml-sycl/cpy.hpp +++ b/ggml/src/ggml-sycl/cpy.hpp @@ -48,6 +48,287 @@ inline void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { } } +inline void cpy_blck_f32_q1_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q1_0 * dsti = (block_q1_0 *) cdsti; + + float sum_abs = 0.0f; + for (int j = 0; j < QK1_0; ++j) { + sum_abs += sycl::fabs((float) xi[j]); + } + + dsti->d = sum_abs / QK1_0; + + for (int j = 0; j < QK1_0 / 8; ++j) { + dsti->qs[j] = 0; + } + + for (int j = 0; j < QK1_0; ++j) { + if (xi[j] >= 0.0f) { + dsti->qs[j / 8] |= (1u << (j % 8)); + } + } +} + +inline int best_index_mxfp4(const float x, const float e) { + int best_index = 0; + float best_err = sycl::fabs((float) (kvalues_mxfp4[0] * e - x)); + for (int i = 1; i < 16; ++i) { + const float err = sycl::fabs((float) (kvalues_mxfp4[i] * e - x)); + if (err < best_err) { + best_index = i; + best_err = err; + } + } + return best_index; +} + +inline int nearest_int_sycl(float x) { + const float val = x + 12582912.0f; + int i; + memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + +inline int nearest_int_ggml_sycl(float x) { + return (int) sycl::round((float) x); +} + +inline uint8_t clamp_u8(const int x, const int lo, const int hi) { + return (uint8_t) dpct::max(lo, dpct::min(hi, x)); +} + +inline int8_t clamp_i8(const int x, const int lo, const int hi) { + return (int8_t) dpct::max(lo, dpct::min(hi, x)); +} + +constexpr float GROUP_MAX_EPS_SYCL = 1e-15f; + +inline float make_qx_quants_sycl(int n, int nmax, const float * x, int8_t * L, int rmse_type, const float * qw) { + float max = 0.0f; + float amax = 0.0f; + for (int i = 0; i < n; ++i) { + const float ax = sycl::fabs(x[i]); + if (ax > amax) { + amax = ax; + max = x[i]; + } + } + if (amax < GROUP_MAX_EPS_SYCL) { + for (int i = 0; i < n; ++i) { + L[i] = 0; + } + return 0.0f; + } + + float iscale = -nmax / max; + if (rmse_type == 0) { + for (int i = 0; i < n; ++i) { + int l = nearest_int_ggml_sycl(iscale * x[i]); + L[i] = (int8_t) (nmax + dpct::max(-nmax, dpct::min(nmax - 1, l))); + } + return 1.0f / iscale; + } + + bool return_early = false; + if (rmse_type < 0) { + rmse_type = -rmse_type; + return_early = true; + } + + float sumlx = 0.0f; + float suml2 = 0.0f; + for (int i = 0; i < n; ++i) { + int l = nearest_int_ggml_sycl(iscale * x[i]); + l = dpct::max(-nmax, dpct::min(nmax - 1, l)); + L[i] = (int8_t) (l + nmax); + + const float w = qw ? qw[i] : (rmse_type == 1 ? x[i] * x[i] : + rmse_type == 2 ? 1.0f : rmse_type == 3 ? sycl::fabs(x[i]) : sycl::sqrt(sycl::fabs(x[i]))); + + sumlx += w * x[i] * l; + suml2 += w * l * l; + } + + float scale = suml2 ? sumlx / suml2 : 0.0f; + if (return_early) { + return suml2 > 0.0f ? 0.5f * (scale + 1.0f / iscale) : 1.0f / iscale; + } + + float best = scale * sumlx; + for (int is = -9; is <= 9; ++is) { + if (is == 0) { + continue; + } + iscale = -(nmax + 0.1f * is) / max; + sumlx = 0.0f; + suml2 = 0.0f; + for (int i = 0; i < n; ++i) { + int l = nearest_int_ggml_sycl(iscale * x[i]); + l = dpct::max(-nmax, dpct::min(nmax - 1, l)); + const float w = qw ? qw[i] : (rmse_type == 1 ? x[i] * x[i] : + rmse_type == 2 ? 1.0f : rmse_type == 3 ? sycl::fabs(x[i]) : sycl::sqrt(sycl::fabs(x[i]))); + sumlx += w * x[i] * l; + suml2 += w * l * l; + } + + if (suml2 > 0.0f && sumlx * sumlx > best * suml2) { + for (int i = 0; i < n; ++i) { + int l = nearest_int_ggml_sycl(iscale * x[i]); + L[i] = (int8_t) (nmax + dpct::max(-nmax, dpct::min(nmax - 1, l))); + } + scale = sumlx / suml2; + best = scale * sumlx; + } + } + + return scale; +} + +inline float make_q3_quants_sycl(int n, int nmax, const float * x, int8_t * L, bool do_rmse) { + float max = 0.0f; + float amax = 0.0f; + for (int i = 0; i < n; ++i) { + const float ax = sycl::fabs(x[i]); + if (ax > amax) { + amax = ax; + max = x[i]; + } + } + + if (amax < GROUP_MAX_EPS_SYCL) { + for (int i = 0; i < n; ++i) { + L[i] = 0; + } + return 0.0f; + } + + const float iscale = -nmax / max; + if (do_rmse) { + float sumlx = 0.0f; + float suml2 = 0.0f; + for (int i = 0; i < n; ++i) { + int l = nearest_int_ggml_sycl(iscale * x[i]); + l = dpct::max(-nmax, dpct::min(nmax - 1, l)); + L[i] = (int8_t) l; + const float w = x[i] * x[i]; + sumlx += w * x[i] * l; + suml2 += w * l * l; + } + + for (int itry = 0; itry < 5; ++itry) { + int n_changed = 0; + for (int i = 0; i < n; ++i) { + const float w = x[i] * x[i]; + float slx = sumlx - w * x[i] * L[i]; + if (slx > 0.0f) { + float sl2 = suml2 - w * L[i] * L[i]; + int new_l = nearest_int_ggml_sycl(x[i] * sl2 / slx); + new_l = dpct::max(-nmax, dpct::min(nmax - 1, new_l)); + if (new_l != L[i]) { + slx += w * x[i] * new_l; + sl2 += w * new_l * new_l; + if (sl2 > 0.0f && slx * slx * suml2 > sumlx * sumlx * sl2) { + L[i] = (int8_t) new_l; + sumlx = slx; + suml2 = sl2; + ++n_changed; + } + } + } + } + if (!n_changed) { + break; + } + } + + for (int i = 0; i < n; ++i) { + L[i] += nmax; + } + return suml2 > 0.0f ? sumlx / suml2 : 0.0f; + } + + for (int i = 0; i < n; ++i) { + int l = nearest_int_ggml_sycl(iscale * x[i]); + l = dpct::max(-nmax, dpct::min(nmax - 1, l)); + L[i] = (int8_t) (l + nmax); + } + + return 1.0f / iscale; +} + +inline void set_scale_min_k4(int j, uint8_t * q, uint8_t d, uint8_t m) { + if (j < 4) { + q[j] = (q[j] & 0xC0) | (d & 0x3F); + q[j + 4] = (q[j + 4] & 0xC0) | (m & 0x3F); + } else { + q[j + 4] = (d & 0x0F) | ((m & 0x0F) << 4); + q[j - 4] = (q[j - 4] & 0x3F) | ((d >> 4) << 6); + q[j - 0] = (q[j - 0] & 0x3F) | ((m >> 4) << 6); + } +} + +inline void get_scale_min_k4_local(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { + if (j < 4) { + d = q[j] & 63; + m = q[j + 4] & 63; + } else { + d = (q[j + 4] & 0xF) | ((q[j - 4] >> 6) << 4); + m = (q[j + 4] >> 4) | ((q[j - 0] >> 6) << 4); + } +} + +inline void cpy_blck_f32_mxfp4(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_mxfp4 * dsti = (block_mxfp4 *) cdsti; + + float amax = 0.0f; + for (int j = 0; j < QK_MXFP4; ++j) { + amax = sycl::fmax(amax, sycl::fabs((float) xi[j])); + } + + const uint8_t e = amax > 0.0f ? (uint8_t) (sycl::floor(sycl::log2(amax)) - 2 + 127) : 0; + const float d = GGML_E8M0_TO_FP32_HALF(e); + + dsti->e = e; + + for (int j = 0; j < QK_MXFP4 / 2; ++j) { + const uint8_t x0 = best_index_mxfp4(xi[0 + j], d); + const uint8_t x1 = best_index_mxfp4(xi[QK_MXFP4 / 2 + j], d); + + dsti->qs[j] = x0; + dsti->qs[j] |= x1 << 4; + } +} + +inline void cpy_blck_f32_nvfp4(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_nvfp4 * dsti = (block_nvfp4 *) cdsti; + + constexpr int n_sub = QK_NVFP4 / QK_NVFP4_SUB; + + for (int s = 0; s < n_sub; ++s) { + const float * xb = xi + s * QK_NVFP4_SUB; + + float amax = 0.0f; + for (int j = 0; j < QK_NVFP4_SUB; ++j) { + amax = sycl::fmax(amax, sycl::fabs((float) xb[j])); + } + + const uint8_t ue = ggml_fp32_to_ue4m3(amax / 6.0f); + dsti->d[s] = ue; + const float d = ggml_ue4m3_to_fp32(ue); + + for (int j = 0; j < QK_NVFP4_SUB / 2; ++j) { + const uint8_t x0 = best_index_mxfp4(xb[0 + j], d); + const uint8_t x1 = best_index_mxfp4(xb[QK_NVFP4_SUB / 2 + j], d); + + dsti->qs[s * (QK_NVFP4_SUB / 2) + j] = x0 | (x1 << 4); + } + } +} + + inline void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { const float * xi = (const float *) cxi; block_q4_0 * dsti = (block_q4_0 *) cdsti; diff --git a/ggml/src/ggml-sycl/dequantize.hpp b/ggml/src/ggml-sycl/dequantize.hpp index ca8cd96c08c3..7b66c73b0cf9 100644 --- a/ggml/src/ggml-sycl/dequantize.hpp +++ b/ggml/src/ggml-sycl/dequantize.hpp @@ -70,6 +70,21 @@ static __dpct_inline__ void dequantize_q4_0_reorder(const void *d_ptr, const int #endif // GGML_SYCL_F16 } +static __dpct_inline__ void dequantize_q1_0_reorder(const void *d_ptr, const int64_t ib, const void *qs, + const int iqs, dfloat2 &v) { + // Q1_0 reorder layout: scale values followed by quantized bits + const dfloat d = (const dfloat)*((const sycl::half*)d_ptr+ib); + + const int bit_index_0 = iqs + 0; + const int bit_index_1 = iqs + 1; + + const int bit_0 = (*((const uint8_t *)qs + bit_index_0 / 8) >> (bit_index_0 % 8)) & 1; + const int bit_1 = (*((const uint8_t *)qs + bit_index_1 / 8) >> (bit_index_1 % 8)) & 1; + + v.x() = (2 * bit_0 - 1) * d; + v.y() = (2 * bit_1 - 1) * d; +} + static __dpct_inline__ void dequantize_q4_1(const void *vx, const int64_t ib, const int iqs, dfloat2 &v) { const block_q4_1 * x = (const block_q4_1 *) vx; diff --git a/ggml/src/ggml-sycl/dmmv.cpp b/ggml/src/ggml-sycl/dmmv.cpp index d80b0a382193..fb8a1757f191 100644 --- a/ggml/src/ggml-sycl/dmmv.cpp +++ b/ggml/src/ggml-sycl/dmmv.cpp @@ -256,13 +256,6 @@ static void convert_mul_mat_vec_bf16_sycl(const void *vx, const dfloat *y, } #endif -/* -DPCT1110:4: The total declared local variable size in device function -dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. -*/ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, const float *__restrict__ yy, float *__restrict__ dst, @@ -284,19 +277,15 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, #if QK_K == 256 const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...7 or 0...15 const int ix = item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 const int step = 16/K_QUANTS_PER_ITERATION; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 + const int in = tid % step; // 0...15 or 0...7 const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int s_offset = 8*im; - const int y_offset = 128*im + l0; uint32_t aux[4]; const uint8_t * d = (const uint8_t *)aux; @@ -304,33 +293,39 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - const float * y = yy + i * QK_K + y_offset; - const uint8_t * q = x[i].qs + q_offset; - const float dall = x[i].dm[0]; const float dmin = x[i].dm[1]; - const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); - aux[0] = a[0] & 0x0f0f0f0f; - aux[1] = a[1] & 0x0f0f0f0f; - aux[2] = (a[0] >> 4) & 0x0f0f0f0f; - aux[3] = (a[1] >> 4) & 0x0f0f0f0f; - - float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) - + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) - + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) - + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) - + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) - + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) - + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) - +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); - sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] - + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + for (int im = 0; im < 2; ++im) { + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + } + tmp += dall * sum1 - dmin * sum2; } - tmp += dall * sum1 - dmin * sum2; } #else @@ -372,7 +367,7 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -382,13 +377,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx, } } -/* -DPCT1110:5: The total declared local variable size in device function -dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. -*/ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, const float *__restrict__ yy, float *__restrict__ dst, @@ -412,52 +400,52 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, const uint16_t kmask2 = 0x0f0f; const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...7 or 0...15 const int ix = item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop const int step = 16/K_QUANTS_PER_ITERATION; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0....15 or 0...7 - - const uint8_t m = 1 << (4*im); + const int in = tid % step; // 0...15 or 0...7 const int l0 = n*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int y_offset = 128*im + l0; uint16_t utmp[4]; const int8_t * s = (const int8_t *)utmp; - const uint16_t s_shift = 4*im; - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - const float * y = yy + i * QK_K + y_offset; - const uint8_t * q = x[i].qs + q_offset; const uint8_t * h = x[i].hmask + l0; - - const uint16_t * a = (const uint16_t *)x[i].scales; - utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); - utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); - utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); - utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); - const float d = x[i].d; - float sum = 0; - for (int l = 0; l < n; ++l) { - sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) - + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) - + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) - + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); - sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) - + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) - + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) - + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + for (int im = 0; im < 2; ++im) { + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + const uint16_t s_shift = 4*im; + const uint8_t m = 1 << (4*im); + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const uint16_t * a = (const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; } - tmp += d * sum; } #else @@ -491,7 +479,7 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -530,53 +518,54 @@ static void dequantize_mul_mat_vec_q3_k_reorder(const void *__restrict__ vx, const uint16_t kmask2 = 0x0f0f; const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...7 or 0...15 const int ix = item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop const int step = 16/K_QUANTS_PER_ITERATION; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0....15 or 0...7 - - const uint8_t m = 1 << (4*im); + const int in = tid % step; // 0...15 or 0...7 const int l0 = n*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int y_offset = 128*im + l0; uint16_t utmp[4]; const int8_t * s = (const int8_t *)utmp; - const uint16_t s_shift = 4*im; - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { const int bi = ib0 + i; - const float * y = yy + i * QK_K + y_offset; - const uint8_t * q = qs_base + bi * (QK_K / 4) + q_offset; const uint8_t * h = hmask_base + bi * (QK_K / 8) + l0; - const uint16_t * a = (const uint16_t *)(scales_base + bi * 12); - utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); - utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); - utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); - utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); - const float d = d_base[bi]; - float sum = 0; - for (int l = 0; l < n; ++l) { - sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) - + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) - + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) - + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); - sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) - + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) - + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) - + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + for (int im = 0; im < 2; ++im) { + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + const uint16_t s_shift = 4*im; + const uint8_t m = 1 << (4*im); + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = qs_base + bi * (QK_K / 4) + q_offset; + + const uint16_t * a = (const uint16_t *)(scales_base + bi * 12); + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; } - tmp += d * sum; } #else GGML_UNUSED(vx); @@ -588,7 +577,7 @@ static void dequantize_mul_mat_vec_q3_k_reorder(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -598,13 +587,6 @@ static void dequantize_mul_mat_vec_q3_k_reorder(const void *__restrict__ vx, } } -/* -DPCT1110:6: The total declared local variable size in device function -dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. -*/ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, const float *__restrict__ yy, float *__restrict__ dst, @@ -625,22 +607,19 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, const uint16_t kmask3 = 0xc0c0; const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...7 or 0...15 const int ix = item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 - const int il = tid/step; // 0...3 - const int ir = tid - step*il; // 0...7 or 0...3 + const int il_base = tid/step; // 0 or 1 (was 0...3) + const int ir = tid - step*il_base; // 0...7 or 0...3 const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; + const int in = il_base%2; const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; uint16_t aux[4]; const uint8_t * sc = (const uint8_t *)aux; @@ -657,55 +636,60 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - const float * y1 = yy + i*QK_K + y_offset; - const float * y2 = y1 + 128; - const float dall = x[i].dm[0]; const float dmin = x[i].dm[1]; - const uint16_t * a = (const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + for (int im = 0; im < 2; ++im) { + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; -#if K_QUANTS_PER_ITERATION == 2 - const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); - const uint32_t * q2 = q1 + 16; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; - q32[0] = q1[0] & 0x0f0f0f0f; - q32[1] = q1[0] & 0xf0f0f0f0; - q32[2] = q2[0] & 0x0f0f0f0f; - q32[3] = q2[0] & 0xf0f0f0f0; + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - sycl::float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < 4; ++l) { - s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4]; - s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12]; - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f + - s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) - - dmin * smin; +#if K_QUANTS_PER_ITERATION == 2 + const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); + const uint32_t * q2 = q1 + 16; + + q32[0] = q1[0] & 0x0f0f0f0f; + q32[1] = q1[0] & 0xf0f0f0f0; + q32[2] = q2[0] & 0x0f0f0f0f; + q32[3] = q2[0] & 0xf0f0f0f0; + + sycl::float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 4; ++l) { + s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4]; + s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f + + s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) - + dmin * smin; #else - const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); - const uint16_t * q2 = q1 + 32; - - q16[0] = q1[0] & 0x0f0f; - q16[1] = q1[0] & 0xf0f0; - q16[2] = q2[0] & 0x0f0f; - q16[3] = q2[0] & 0xf0f0; - - float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < 2; ++l) { - s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; - s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; + const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); + const uint16_t * q2 = q1 + 32; + + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[0] & 0xf0f0; + q16[2] = q2[0] & 0x0f0f; + q16[3] = q2[0] & 0xf0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 2; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; + s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; #endif + } } #else @@ -741,7 +725,7 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -776,22 +760,19 @@ static void dequantize_mul_mat_vec_q4_k_reorder(const void *__restrict__ vx, const uint16_t kmask3 = 0xc0c0; const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...7 or 0...15 const int ix = item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1 const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 - const int il = tid/step; // 0...3 - const int ir = tid - step*il; // 0...7 or 0...3 + const int il_base = tid/step; // 0 or 1 (was 0...3) + const int ir = tid - step*il_base; // 0...7 or 0...3 const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; + const int in = il_base%2; const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; uint16_t aux[4]; const uint8_t * sc = (const uint8_t *)aux; @@ -809,56 +790,61 @@ static void dequantize_mul_mat_vec_q4_k_reorder(const void *__restrict__ vx, for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { const int bi = ib0 + i; - const float * y1 = yy + i*QK_K + y_offset; - const float * y2 = y1 + 128; - const sycl::half2 dm_val = dm_base[bi]; const float dall = dm_val[0]; const float dmin = dm_val[1]; - const uint16_t * a = (const uint16_t *)(scales_base + bi * K_SCALE_SIZE); - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + for (int im = 0; im < 2; ++im) { + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; -#if K_QUANTS_PER_ITERATION == 2 - const uint32_t * q1 = (const uint32_t *)(qs_base + bi * (QK_K / 2) + q_offset); - const uint32_t * q2 = q1 + 16; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; - q32[0] = q1[0] & 0x0f0f0f0f; - q32[1] = q1[0] & 0xf0f0f0f0; - q32[2] = q2[0] & 0x0f0f0f0f; - q32[3] = q2[0] & 0xf0f0f0f0; + const uint16_t * a = (const uint16_t *)(scales_base + bi * K_SCALE_SIZE); + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - sycl::float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < 4; ++l) { - s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4]; - s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12]; - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f + - s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) - - dmin * smin; +#if K_QUANTS_PER_ITERATION == 2 + const uint32_t * q1 = (const uint32_t *)(qs_base + bi * (QK_K / 2) + q_offset); + const uint32_t * q2 = q1 + 16; + + q32[0] = q1[0] & 0x0f0f0f0f; + q32[1] = q1[0] & 0xf0f0f0f0; + q32[2] = q2[0] & 0x0f0f0f0f; + q32[3] = q2[0] & 0xf0f0f0f0; + + sycl::float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 4; ++l) { + s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4]; + s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f + + s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) - + dmin * smin; #else - const uint16_t * q1 = (const uint16_t *)(qs_base + bi * (QK_K / 2) + q_offset); - const uint16_t * q2 = q1 + 32; - - q16[0] = q1[0] & 0x0f0f; - q16[1] = q1[0] & 0xf0f0; - q16[2] = q2[0] & 0x0f0f; - q16[3] = q2[0] & 0xf0f0; - - float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < 2; ++l) { - s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; - s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; + const uint16_t * q1 = (const uint16_t *)(qs_base + bi * (QK_K / 2) + q_offset); + const uint16_t * q2 = q1 + 32; + + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[0] & 0xf0f0; + q16[2] = q2[0] & 0x0f0f; + q16[3] = q2[0] & 0xf0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 2; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; + s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; #endif + } } #else @@ -897,7 +883,7 @@ static void dequantize_mul_mat_vec_q4_k_reorder(const void *__restrict__ vx, // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -907,13 +893,6 @@ static void dequantize_mul_mat_vec_q4_k_reorder(const void *__restrict__ vx, } } -/* -DPCT1110:7: The total declared local variable size in device function -dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register -pressure. Consult with your hardware vendor to find the total register size -available and adjust the code, or use smaller sub-group size to avoid high -register pressure. -*/ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, const float *__restrict__ yy, float *__restrict__ dst, @@ -928,6 +907,141 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, float tmp = 0; // partial sum for thread in warp +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = item_ct1.get_local_id(2) / 2; // 0...7 + const int ix = item_ct1.get_local_id(2) % 2; + + const int il_base = tid/4; // 0 or 1 (was 0...3) + const int ir = tid - 4*il_base;// 0...3 + const int n = 2; + + const int in = il_base%2; + + const int l0 = n*(2*ir + in); + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + uint16_t q16[8]; + const uint8_t * q4 = (const uint8_t *)q16; + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const uint8_t * qh = x[i].qh + l0; + const float dall = x[i].dm[0]; + const float dmin = x[i].dm[1]; + + for (int im = 0; im < 2; ++im) { + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + const uint8_t * ql1 = x[i].qs + q_offset; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + sycl::float4 sum = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + const uint16_t * q1 = (const uint16_t *)ql1; + const uint16_t * q2 = q1 + 32; + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[8] & 0x0f0f; + q16[2] = (q1[0] >> 4) & 0x0f0f; + q16[3] = (q1[8] >> 4) & 0x0f0f; + q16[4] = q2[0] & 0x0f0f; + q16[5] = q2[8] & 0x0f0f; + q16[6] = (q2[0] >> 4) & 0x0f0f; + q16[7] = (q2[8] >> 4) & 0x0f0f; + for (int l = 0; l < n; ++l) { + sum.x() += + y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) + + y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0)); + sum.y() += + y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) + + y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0)); + sum.z() += + y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) + + y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0)); + sum.w() += + y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) + + y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] + + sum.w() * sc[5]) - + dmin * smin; + } + } + +#else + const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); + const int step = tid * K_QUANTS_PER_ITERATION; + const int im = step/8; + const int in = step%8; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const int8_t * s = x[i].scales; + const float * y = yy + i*QK_K + step; + const float d = x[i].d; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + const uint8_t h = x[i].qh[in+j] >> im; + sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) + + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) + + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) + + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result +#pragma unroll + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +static void dequantize_mul_mat_vec_q5_k_reorder(const void *__restrict__ vx, + const float *__restrict__ yy, + float *__restrict__ dst, + const int ncols, int nrows, + const sycl::nd_item<3> &item_ct1) { + + const int row = item_ct1.get_group(2); + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + // SOA base pointers for the reordered layout: + // [qs: nb * QK_K/2] [qh: nb * QK_K/8] [scales: nb * K_SCALE_SIZE] [dm: nb * sizeof(half2)] + const int nb = nrows * num_blocks_per_row; + const uint8_t * qs_base = (const uint8_t *)vx; + const uint8_t * qh_base = qs_base + (size_t)nb * (QK_K / 2); + const uint8_t * scales_base = qh_base + (size_t)nb * (QK_K / 8); + const sycl::half2 * dm_base = (const sycl::half2 *)(scales_base + (size_t)nb * K_SCALE_SIZE); + + float tmp = 0; // partial sum for thread in warp + #if QK_K == 256 const uint16_t kmask1 = 0x3f3f; const uint16_t kmask2 = 0x0f0f; @@ -957,16 +1071,18 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, const uint8_t * q4 = (const uint8_t *)q16; for (int i = ix; i < num_blocks_per_row; i += 2) { + const int bi = ib0 + i; - const uint8_t * ql1 = x[i].qs + q_offset; - const uint8_t * qh = x[i].qh + l0; + const uint8_t * ql1 = qs_base + bi * (QK_K / 2) + q_offset; + const uint8_t * qh = qh_base + bi * (QK_K / 8) + l0; const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; - const float dall = x[i].dm[0]; - const float dmin = x[i].dm[1]; + const sycl::half2 dm_val = dm_base[bi]; + const float dall = dm_val[0]; + const float dmin = dm_val[1]; - const uint16_t * a = (const uint16_t *)x[i].scales; + const uint16_t * a = (const uint16_t *)(scales_base + bi * K_SCALE_SIZE); aux[0] = a[im+0] & kmask1; aux[1] = a[im+2] & kmask1; aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); @@ -1004,29 +1120,8 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx, sum.w() * sc[5]) - dmin * smin; } - #else - const int tid = item_ct1.get_local_id(2)/(2*K_QUANTS_PER_ITERATION); // 0...15 - const int ix = item_ct1.get_local_id(2)%(2*K_QUANTS_PER_ITERATION); - const int step = tid * K_QUANTS_PER_ITERATION; - const int im = step/8; - const int in = step%8; - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - const uint8_t * q = x[i].qs + step; - const int8_t * s = x[i].scales; - const float * y = yy + i*QK_K + step; - const float d = x[i].d; - float sum = 0.f; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - const uint8_t h = x[i].qh[in+j] >> im; - sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) - + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) - + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) - + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); - } - tmp += sum; - } + // The reordered Q5_K layout is only produced for QK_K == 256. #endif // sum up partial sums and write back result @@ -1058,14 +1153,13 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa #if QK_K == 256 const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...7 or 0...15 const int ix = item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1 const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 + const int in = tid % step; // 0...15 or 0...7 #if K_QUANTS_PER_ITERATION == 1 const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 @@ -1074,42 +1168,45 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa const int l0 = 4 * in; // 0, 4, 8, ..., 28 const int is = in / 4; #endif - const int ql_offset = 64*im + l0; - const int qh_offset = 32*im + l0; - const int s_offset = 8*im + is; - const int y_offset = 128*im + l0; float tmp = 0; // partial sum for thread in warp for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - const float * y = yy + i * QK_K + y_offset; - const uint8_t * ql = x[i].ql + ql_offset; - const uint8_t * qh = x[i].qh + qh_offset; - const int8_t * s = x[i].scales + s_offset; - const float d = x[i].d; + for (int im = 0; im < 2; ++im) { + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * ql = x[i].ql + ql_offset; + const uint8_t * qh = x[i].qh + qh_offset; + const int8_t * s = x[i].scales + s_offset; + #if K_QUANTS_PER_ITERATION == 1 - float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) - + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) - + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) - + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) - + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) - + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) - + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) - +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); - tmp += sum; + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp += sum; #else - float sum = 0; - for (int l = 0; l < 4; ++l) { - sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) - + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) - + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) - + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); - } - tmp += sum; + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp += sum; #endif + } } @@ -1146,7 +1243,7 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -1179,14 +1276,13 @@ static void dequantize_mul_mat_vec_q6_k_reorder(const void * __restrict__ vx, co #if QK_K == 256 const int tid = - item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...7 or 0...15 const int ix = item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1 const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 + const int in = tid % step; // 0...15 or 0...7 #if K_QUANTS_PER_ITERATION == 1 const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 @@ -1195,43 +1291,46 @@ static void dequantize_mul_mat_vec_q6_k_reorder(const void * __restrict__ vx, co const int l0 = 4 * in; // 0, 4, 8, ..., 28 const int is = in / 4; #endif - const int ql_offset = 64*im + l0; - const int qh_offset = 32*im + l0; - const int s_offset = 8*im + is; - const int y_offset = 128*im + l0; float tmp = 0; // partial sum for thread in warp for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { const int bi = ib0 + i; - const float * y = yy + i * QK_K + y_offset; - const uint8_t * ql = ql_base + bi * (QK_K / 2) + ql_offset; - const uint8_t * qh = qh_base + bi * (QK_K / 4) + qh_offset; - const int8_t * s = scales_base + bi * (QK_K / 16) + s_offset; - const float d = d_base[bi]; + for (int im = 0; im < 2; ++im) { + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * ql = ql_base + bi * (QK_K / 2) + ql_offset; + const uint8_t * qh = qh_base + bi * (QK_K / 4) + qh_offset; + const int8_t * s = scales_base + bi * (QK_K / 16) + s_offset; + #if K_QUANTS_PER_ITERATION == 1 - float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) - + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) - + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) - + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) - + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) - + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) - + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) - +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); - tmp += sum; + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp += sum; #else - float sum = 0; - for (int l = 0; l < 4; ++l) { - sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) - + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) - + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) - + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); - } - tmp += sum; + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp += sum; #endif + } } @@ -1269,7 +1368,7 @@ static void dequantize_mul_mat_vec_q6_k_reorder(const void * __restrict__ vx, co // sum up partial sums and write back result #pragma unroll - for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) { + for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) { tmp += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); } @@ -1324,6 +1423,50 @@ static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y, } } +static void dequantize_mul_mat_vec_q1_0_sycl_reorder(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec_reorder( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + +static void dequantize_mul_mat_vec_q1_0_sycl(const void *vx, const dfloat *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + dequantize_mul_mat_vec( + vx, y, dst, ncols, nrows, item_ct1); + }); + } +} + static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y, float *dst, const int ncols, const int nrows, @@ -1513,10 +1656,10 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y, const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, ny, WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1); }); } @@ -1529,10 +1672,10 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y, const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, ny, WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1); }); } @@ -1545,10 +1688,10 @@ static void dequantize_mul_mat_vec_q3_K_sycl_reorder(const void *vx, const float const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, ny, WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec_q3_k_reorder(vx, y, dst, ncols, nrows, item_ct1); }); } @@ -1561,10 +1704,10 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y, const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, ny, WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1); }); } @@ -1574,10 +1717,10 @@ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, 1, WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1); }); } @@ -1590,10 +1733,10 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y, const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, ny, WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1); }); } @@ -1606,14 +1749,27 @@ static void dequantize_mul_mat_vec_q4_K_sycl_reorder(const void *vx, const float const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, ny, WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec_q4_k_reorder(vx, y, dst, ncols, nrows, item_ct1); }); } +static void dequantize_mul_mat_vec_q5_K_sycl_reorder(const void *vx, const float *y, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE); + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + dequantize_mul_mat_vec_q5_k_reorder(vx, y, dst, ncols, nrows, item_ct1); + }); +} + static void dequantize_mul_mat_vec_q6_K_sycl_reorder(const void *vx, const float *y, float *dst, const int ncols, const int nrows, @@ -1622,10 +1778,10 @@ static void dequantize_mul_mat_vec_q6_K_sycl_reorder(const void *vx, const float const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; const sycl::range<3> block_nums(1, 1, block_num_y); - const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE); + const sycl::range<3> block_dims(1, ny, WARP_SIZE); stream->parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { dequantize_mul_mat_vec_q6_k_reorder(vx, y, dst, ncols, nrows, item_ct1); }); } @@ -1647,6 +1803,7 @@ void ggml_sycl_op_dequantize_mul_mat_vec( sycl::half *src1_dfloat = nullptr; // dfloat == half bool src1_convert_f16 = + src0->type == GGML_TYPE_Q1_0 || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16 || @@ -1665,6 +1822,14 @@ void ggml_sycl_op_dequantize_mul_mat_vec( #endif // GGML_SYCL_F16 switch (src0->type) { + case GGML_TYPE_Q1_0: + if ((ggml_tensor_extra_gpu*)dst->src[0]->extra && + ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { + dequantize_mul_mat_vec_q1_0_sycl_reorder(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + } else { + dequantize_mul_mat_vec_q1_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + } + break; case GGML_TYPE_Q4_0: if ((ggml_tensor_extra_gpu*)dst->src[0]->extra && ((ggml_tensor_extra_gpu*)dst->src[0]->extra)->optimized_feature.reorder) { @@ -1710,7 +1875,12 @@ void ggml_sycl_op_dequantize_mul_mat_vec( } break; case GGML_TYPE_Q5_K: - dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && + ((ggml_tensor_extra_gpu *) dst->src[0]->extra)->optimized_feature.reorder) { + dequantize_mul_mat_vec_q5_K_sycl_reorder(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + } else { + dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + } break; case GGML_TYPE_Q6_K: if ((ggml_tensor_extra_gpu *) dst->src[0]->extra && diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index 791d3cac52e1..664b8e9697f8 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -13,14 +13,14 @@ #ifndef GGML_SYCL_DPCT_HELPER_HPP #define GGML_SYCL_DPCT_HELPER_HPP +#include +#include +#include + #include #include #include -#include - -#include "ggml.h" - #if defined(__linux__) #include #elif defined(_WIN64) @@ -43,6 +43,7 @@ #include #endif + #define DPCT_COMPATIBILITY_TEMP (900) #if defined(_MSC_VER) @@ -59,6 +60,13 @@ #define __dpct_noinline__ __attribute__((noinline)) #endif +#define DPCT_UNUSED(x) (void)(x) + +inline void _abort(const char * str) { + std::cerr << str << std::endl; + std::abort(); +} + inline std::string get_device_type_name(const sycl::device &Device) { auto DeviceType = Device.get_info(); switch (DeviceType) { @@ -1017,7 +1025,7 @@ namespace dpct if (backend == "opencl:cpu") return 4; if (backend == "opencl:acc") return 5; printf("convert_backend_index: can't handle backend=%s\n", backend.c_str()); - GGML_ABORT("fatal error"); + _abort("fatal error"); } static bool compare_backend(std::string &backend1, std::string &backend2) { return convert_backend_index(backend1) < convert_backend_index(backend2); @@ -1426,7 +1434,7 @@ namespace dpct if (!size) return sycl::event{}; return q.memcpy(to_ptr, from_ptr, size, dep_events); - GGML_UNUSED(direction); + DPCT_UNUSED(direction); } // Get actual copy range and make sure it will not exceed range. @@ -2092,7 +2100,7 @@ namespace dpct if (!size) return sycl::event{}; return q.memcpy(to_ptr, from_ptr, size, dep_events); - GGML_UNUSED(direction); + DPCT_UNUSED(direction); } // Get actual copy range and make sure it will not exceed range. diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 249e80c826ea..0c82ceb969f4 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -43,14 +43,44 @@ static __dpct_inline__ T op_sgn(T x) { return x > static_cast(0.f) ? static_cast(1.f) : ((x < static_cast(0.f) ? static_cast(-1.f) : static_cast(0.f))); } + template static __dpct_inline__ T op_abs(T x) { - return sycl::fabs(x); + if constexpr (std::is_same_v) { + return sycl::ext::oneapi::experimental::fabs(x); // or experimental namespace if needed + } else { + return sycl::fabs(x); + } +} + +template +static __dpct_inline__ T op_expm1(T x) { + if constexpr (std::is_same_v) { + return static_cast( + sycl::expm1(static_cast(x)) + ); + } else { + return sycl::expm1(x); + } } template static __dpct_inline__ T op_elu(T x) { - return (x > static_cast(0.f)) ? x : sycl::expm1(x); + return (x > static_cast(0.f)) ? x : op_expm1(x); +} + +template +static __dpct_inline__ T op_tanh(T x) { + if constexpr (std::is_same_v) { + constexpr int ver = __INTEL_LLVM_COMPILER; +#if defined(__INTEL_LLVM_COMPILER) && (__INTEL_LLVM_COMPILER >= 20260000) + return sycl::ext::oneapi::experimental::tanh(x); +#else + return static_cast(sycl::tanh(static_cast(x))); +#endif + } else { + return sycl::tanh(x); + } } template @@ -59,69 +89,106 @@ static __dpct_inline__ T op_gelu(T x) { const T SQRT_2_OVER_PI = static_cast(0.79788456080286535587989211986876f); return static_cast(0.5f) * x * (static_cast(1.0f) + - sycl::tanh(SQRT_2_OVER_PI * x * (static_cast(1.0f) + GELU_COEF_A * x * x))); + op_tanh(SQRT_2_OVER_PI * x * (static_cast(1.0f) + GELU_COEF_A * x * x))); +} + +template +static __dpct_inline__ T op_exp(T x) { + if constexpr (std::is_same_v) { + return sycl::ext::oneapi::experimental::exp(x); + } else { + return sycl::exp(x); + } } template static __dpct_inline__ T op_silu(T x) { - return x / (static_cast(1.0f) + sycl::native::exp(-x)); + return x / (static_cast(1.0f) + op_exp(-x)); } template -static __dpct_inline__ T op_gelu_quick(T x) { - const T GELU_QUICK_COEF_LOCAL = static_cast(-1.702f); - return x * (static_cast(1.0f) / (static_cast(1.0f) + sycl::native::exp(GELU_QUICK_COEF_LOCAL * x))); +static __dpct_inline__ T op_erf(T x) { + if constexpr (std::is_same_v) { + return static_cast( + sycl::erf(static_cast(x)) + ); + } else { + return sycl::erf(x); + } } template static __dpct_inline__ T op_gelu_erf(T x) { const T SQRT_2_INV = static_cast(0.70710678118654752440084436210484f); - return static_cast(0.5f) * x * (static_cast(1.0f) + sycl::erf(x * SQRT_2_INV)); + return static_cast(0.5f) * x * (static_cast(1.0f) + op_erf(x * SQRT_2_INV)); } template -static __dpct_inline__ T op_tanh(T x) { - return sycl::tanh(x); +static __dpct_inline__ T op_gelu_quick(T x) { + const T GELU_QUICK_COEF_LOCAL = static_cast(-1.702f); + return x * (static_cast(1.0f) / (static_cast(1.0f) + op_exp(GELU_QUICK_COEF_LOCAL * x))); } template static __dpct_inline__ T op_relu(T x) { - return sycl::fmax(x, static_cast(0)); + if constexpr (std::is_same_v) { + return sycl::ext::oneapi::experimental::fmax(x, static_cast(0)); + } else { + return sycl::fmax(x, static_cast(0)); + } } template static __dpct_inline__ T op_sigmoid(T x) { - return static_cast(1.0f) / (static_cast(1.0f) + sycl::native::exp(-x)); + return static_cast(1.0f) / (static_cast(1.0f) + op_exp(-x)); } template static __dpct_inline__ T op_sqrt(T x) { - return sycl::sqrt(x); + if constexpr (std::is_same_v) { + return sycl::ext::oneapi::experimental::sqrt(x); + } else { + return sycl::sqrt(x); + } } template static __dpct_inline__ T op_sin(T x) { - return sycl::sin(x); + if constexpr (std::is_same_v) { + return sycl::ext::oneapi::experimental::sin(x); + } else { + return sycl::sin(x); + } } template static __dpct_inline__ T op_cos(T x) { - return sycl::cos(x); + if constexpr (std::is_same_v) { + return sycl::ext::oneapi::experimental::cos(x); + } else { + return sycl::cos(x); + } } template static __dpct_inline__ T op_hardsigmoid(T x) { - return sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x + static_cast(3.0f)) / static_cast(6.0f))); + if constexpr (std::is_same_v) { + return sycl::ext::oneapi::experimental::fmin( + static_cast(1.0f), sycl::ext::oneapi::experimental::fmax( + static_cast(0.0f), (x + static_cast(3.0f)) / static_cast(6.0f))); + } else { + return sycl::fmin(static_cast(1.0f), + sycl::fmax(static_cast(0.0f), (x + static_cast(3.0f)) / static_cast(6.0f))); + } } template static __dpct_inline__ T op_hardswish(T x) { - return x * sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x + static_cast(3.0f)) / static_cast(6.0f))); -} - -template -static __dpct_inline__ T op_exp(T x) { - return sycl::exp(x); + if constexpr (std::is_same_v) { + return x * sycl::ext::oneapi::experimental::fmin(static_cast(1.0f), sycl::ext::oneapi::experimental::fmax(static_cast(0.0f), (x + static_cast(3.0f)) / static_cast(6.0f))); + } else { + return x * sycl::fmin(static_cast(1.0f), sycl::fmax(static_cast(0.0f), (x + static_cast(3.0f)) / static_cast(6.0f))); + } } template @@ -129,13 +196,17 @@ static __dpct_inline__ T op_log(T x) { if (x <= static_cast(0)) { return neg_infinity(); } - return sycl::log(x); + if constexpr (std::is_same_v) { + return sycl::ext::oneapi::experimental::log(x); + } else { + return sycl::log(x); + } } template static __dpct_inline__ T op_softplus(T x) { const float xf = (float) x; - const float ax = sycl::fabs(xf); + const float ax = op_abs(xf); const float m = sycl::fmax(xf, 0.0f); const float y = m + sycl::log1p(sycl::exp(-ax)); return (T) y; @@ -154,8 +225,14 @@ static __dpct_inline__ T op_step(T x) { template static __dpct_inline__ T op_leaky_relu(T x, float negative_slope) { T neg_slope_T = static_cast(negative_slope); - return sycl::fmax(x, static_cast(0)) + + if constexpr (std::is_same_v) { + return sycl::ext::oneapi::experimental::fmax(x, static_cast(0)) + + sycl::ext::oneapi::experimental::fmin(x, static_cast(0.0f)) * neg_slope_T; + + } else { + return sycl::fmax(x, static_cast(0)) + sycl::fmin(x, static_cast(0.0f)) * neg_slope_T; + } } template @@ -170,22 +247,40 @@ static __dpct_inline__ T op_clamp(T x, float min_val, float max_val) { template static __dpct_inline__ T op_floor(T x) { - return sycl::floor(x); + if constexpr (std::is_same_v) { + return sycl::ext::oneapi::experimental::floor(x); + } else { + return sycl::floor(x); + } } template static __dpct_inline__ T op_ceil(T x) { - return sycl::ceil(x); + if constexpr (std::is_same_v) { + return sycl::ext::oneapi::experimental::ceil(x); + } else { + return sycl::ceil(x); + } } template static __dpct_inline__ T op_round(T x) { - return sycl::round(x); + if constexpr (std::is_same_v) { + return static_cast( + sycl::round(static_cast(x)) + ); + } else { + return sycl::round(x); + } } template static __dpct_inline__ T op_trunc(T x) { - return sycl::trunc(x); + if constexpr (std::is_same_v) { + return sycl::ext::oneapi::experimental::trunc(x); + } else { + return sycl::trunc(x); + } } template @@ -266,13 +361,6 @@ static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl: } } -template -static void unary_op_floor_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_floor(x[i]); - } -} - template static void unary_op_ceil_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { SYCL_GLOBAL_ID_LOOP(k, item_ct1) { @@ -280,20 +368,6 @@ static void unary_op_ceil_kernel(const T * x, T * dst, const int k, const sycl:: } } -template -static void unary_op_round_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_round(x[i]); - } -} - -template -static void unary_op_trunc_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) { - SYCL_GLOBAL_ID_LOOP(k, item_ct1) { - dst[i] = op_trunc(x[i]); - } -} - template static void clamp(const T * x, T * dst, const float min, const float max, const int k, const sycl::nd_item<1> &item_ct1) { @@ -355,7 +429,7 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst, const int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE; stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)), - [=](sycl::nd_item<3> /*item_ct1*/) { + [=](sycl::nd_item<3> /*item_ct1*/) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, ne13, s1, s2, s3, offset); }); } @@ -370,8 +444,8 @@ static void arange_kernel(T * dst, const int k, T start, T step, template static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); - GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16 || dst->src[0]->type == GGML_TYPE_BF16); + GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_BF16); GGML_ASSERT(dst->src[0]->type == dst->type); dpct::queue_ptr main_stream = ctx.stream(); @@ -383,6 +457,14 @@ static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward(args)...); break; } +#ifdef GGML_SYCL_HAS_BF16 + case GGML_TYPE_BF16: + { + auto data_pts = cast_data(dst); + kernel_invoker(data_pts.src, data_pts.dst, (int)ggml_nelements(dst->src[0]), main_stream, std::forward(args)...); + break; + } +#endif case GGML_TYPE_F32: { auto data_pts = cast_data(dst); @@ -496,7 +578,7 @@ static inline void ggml_sycl_op_unary( stream->parallel_for( sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), sycl::range<1>(256)), - [=](sycl::nd_item<1> item_ct1) { + [=](sycl::nd_item<1> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { unary_op_generic_kernel( src, dst_ptr, k_elements, ne0, ne1, ne2, ne3, @@ -524,7 +606,7 @@ static inline void ggml_sycl_op_arange(ggml_backend_sycl_context & ctx, ggml_ten stream->parallel_for( sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_ARANGE_BLOCK_SIZE), sycl::range<1>(SYCL_ARANGE_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { + [=](sycl::nd_item<1> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { arange_kernel(dst_ptr, k, start, step, item_ct1); }); } @@ -605,6 +687,12 @@ static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor }); } +static inline void ggml_sycl_op_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) { + return op_expm1(x); + }); +} + static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { @@ -612,7 +700,7 @@ static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor stream->parallel_for( sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE), sycl::range<1>(SYCL_EXP_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { + [=](sycl::nd_item<1> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { unary_op_log_kernel(src, dst_ptr, k_elements, item_ct1); }); }); @@ -650,7 +738,7 @@ static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tenso stream->parallel_for( sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE), sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { + [=](sycl::nd_item<1> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { unary_op_sqrt_kernel(src, dst_ptr, k_elements, item_ct1); }); }); @@ -663,7 +751,7 @@ static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor stream->parallel_for( sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE), sycl::range<1>(SYCL_SIN_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { + [=](sycl::nd_item<1> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { unary_op_sin_kernel(src, dst_ptr, k_elements, item_ct1); }); }); @@ -676,7 +764,7 @@ static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor stream->parallel_for( sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE), sycl::range<1>(SYCL_SIN_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { + [=](sycl::nd_item<1> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { unary_op_cos_kernel(src, dst_ptr, k_elements, item_ct1); }); }); @@ -691,7 +779,7 @@ static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml stream->parallel_for( sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { + [=](sycl::nd_item<1> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { unary_op_leaky_relu_kernel(src, dst_ptr, k_elements, slope, item_ct1); }); }, negative_slope); @@ -704,7 +792,7 @@ static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor stream->parallel_for( sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQR_BLOCK_SIZE), sycl::range<1>(SYCL_SQR_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { + [=](sycl::nd_item<1> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { unary_op_sqr_kernel(src, dst_ptr, k_elements, item_ct1); }); }); @@ -721,23 +809,16 @@ static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tens stream->parallel_for( sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE), sycl::range<1>(SYCL_CLAMP_BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { + [=](sycl::nd_item<1> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { clamp(src, dst_ptr, min_arg, max_arg, k_elements, item_ct1); }); }, min_val, max_val); } static inline void ggml_sycl_op_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, 256); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), - sycl::range<1>(256)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_floor_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); + ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) { + return op_floor(x); + }); } static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { @@ -747,29 +828,15 @@ static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tenso } static inline void ggml_sycl_op_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, 256); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), - sycl::range<1>(256)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_round_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); + ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) { + return op_round(x); + }); } static inline void ggml_sycl_op_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst, - [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) { - const int num_blocks = ceil_div(k_elements, 256); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256), - sycl::range<1>(256)), - [=](sycl::nd_item<1> item_ct1) { - unary_op_trunc_kernel(src, dst_ptr, k_elements, item_ct1); - }); - }); + ggml_sycl_detail::ggml_sycl_op_unary(ctx, dst, [](auto x) { + return op_trunc(x); + }); } static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { @@ -805,7 +872,8 @@ static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tens [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); main_stream->parallel_for( - sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), + sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { gated_op_fused_geglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); }); }); @@ -816,7 +884,8 @@ static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tens [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu main_stream->parallel_for( - sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), + sycl::range<1>(SYCL_RELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { gated_op_fused_reglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); }); }); @@ -827,7 +896,8 @@ static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_ten [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu main_stream->parallel_for( - sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), + sycl::range<1>(SYCL_SILU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { gated_op_fused_swiglu(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); }); }); @@ -842,7 +912,6 @@ __dpct_inline__ float ggml_sycl_op_swiglu_oai_single(float x, float g, float alp return out_glu; } - template static void swiglu_oai_kernel(const T * x, const T * g, T * dst, const int64_t k, const int64_t n, const int64_t o0, const int64_t o1, @@ -876,7 +945,7 @@ static void swiglu_oai_sycl(const T * x, const int64_t num_blocks = (k + SYCL_GLU_BLOCK_SIZE - 1) / SYCL_GLU_BLOCK_SIZE; stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, SYCL_GLU_BLOCK_SIZE), sycl::range<3>(1, 1, SYCL_GLU_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { swiglu_oai_kernel(x, g, dst, k, n, o0, o1, alpha, limit, item_ct1); }); } @@ -930,7 +999,8 @@ static inline void ggml_sycl_op_geglu_erf(ggml_backend_sycl_context & ctx, ggml_ [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); main_stream->parallel_for( - sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), + sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { gated_op_fused_geglu_erf(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); }); }); @@ -941,7 +1011,8 @@ static inline void ggml_sycl_op_geglu_quick(ggml_backend_sycl_context & ctx, ggm [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) { const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE); main_stream->parallel_for( - sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) { + sycl::nd_range<1>((num_blocks * sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), + sycl::range<1>(SYCL_GELU_BLOCK_SIZE)), [=](sycl::nd_item<1> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { gated_op_fused_geglu_quick(x_ptr, g_ptr, dst_ptr, k, n, o0, o1, item_ct1); }); }); @@ -1018,6 +1089,11 @@ void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_op_exp(ctx, dst); } +void ggml_sycl_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_expm1(ctx, dst); +} + void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_log(ctx, dst); diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index 997132166ab2..3bdc38596819 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -59,6 +59,8 @@ void ggml_sycl_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_expm1(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + void ggml_sycl_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_softplus(ggml_backend_sycl_context & ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 3f246e8672d5..41449db665ec 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -32,7 +32,7 @@ #include #include -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API #include #endif #if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC @@ -62,6 +62,9 @@ #include "ggml-sycl/repeat_back.hpp" #include "ggml-sycl/set_rows.hpp" #include "ggml-sycl/set.hpp" +#include "ggml-sycl/conv2d.hpp" +#include "ggml-sycl/conv2d-dw.hpp" +#include "ggml-sycl/conv2d-transpose.hpp" #include "ggml-sycl/ssm_conv.hpp" #include "ggml-sycl/sycl_hw.hpp" #include "ggml-sycl/ssm_scan.hpp" @@ -70,6 +73,10 @@ #include "ggml-sycl/diag.hpp" #include "ggml-sycl/solve_tri.hpp" #include "ggml-sycl/gated_delta_net.hpp" +#include "ggml-sycl/pool.hpp" + +#define MEM_SIZE_2M 0x00200000 +#define MEM_SIZE_1G 0x40000000 static bool g_sycl_loaded = false; int g_ggml_sycl_debug = 0; @@ -80,9 +87,10 @@ int g_ggml_sycl_enable_vmm = 1; int g_ggml_sycl_prioritize_dmmv = 0; int g_ggml_sycl_use_async_mem_op = 0; int g_ggml_sycl_use_async_mem_op_requested = 1; -int g_ggml_sycl_enable_level_zero = 0; +int g_ggml_sycl_use_level_zero_api = 0; int g_ggml_sycl_enable_flash_attention = 1; - +int g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL; +int g_ggml_sycl_usm_system = 0; static ggml_sycl_device_info ggml_sycl_init() { ggml_sycl_device_info info = {}; @@ -136,6 +144,7 @@ static ggml_sycl_device_info ggml_sycl_init() { info.devices[i].opt_feature.reorder = device.ext_oneapi_architecture_is(syclex::arch_category::intel_gpu); info.devices[i].smpbo = prop.get_local_mem_size(); info.devices[i].warp_size = WARP_SIZE; + info.devices[i].usm_system_support = device.has(sycl::aspect::usm_system_allocations); info.max_work_group_sizes[i] = prop.get_max_work_group_size(); info.devices[i].max_wg_per_cu = info.max_work_group_sizes[i] / prop.get_max_compute_units(); @@ -147,11 +156,31 @@ static ggml_sycl_device_info ggml_sycl_init() { GGML_LOG_WARN("SYCL GPU device %d does not use Level Zero backend, disabling Level Zero memory API\n", i); info.ext_oneapi_level_zero = false; } + +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API + if (info.ext_oneapi_level_zero && device.is_gpu() && device.default_queue().get_backend() == sycl::backend::ext_oneapi_level_zero) { + ze_device_handle_t ze_dev = sycl::get_native(device.default_queue().get_device()); + ze_device_properties_t props = {}; + props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; + ze_result_t r = zeDeviceGetProperties(ze_dev, &props); + info.devices[i].l0_discrete_gpu = r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED); + } +#endif } for (int id = 0; id < info.device_count; ++id) { info.default_tensor_split[id] /= total_vram; } + +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API + // Large buffers can be allocated before ggml_check_sycl() initializes other + // g_ggml_sycl_enable_* globals, so initialize this one as early as we can. + g_ggml_sycl_use_level_zero_api = + info.ext_oneapi_level_zero && ggml_sycl_get_env("GGML_SYCL_USE_LEVEL_ZERO_API", 1); +#else + g_ggml_sycl_use_level_zero_api = 0; +#endif + return info; } @@ -236,42 +265,30 @@ void ggml_backend_sycl_print_sycl_devices() { print_device_opt_feature(device_count); } -static inline int get_sycl_env(const char *env_name, int default_val) { - char *user_device_string = getenv(env_name); - int user_number = default_val; - - unsigned n; - if (user_device_string != NULL && - sscanf(user_device_string, " %u", &n) == 1) { - user_number = (int)n; - } else { - user_number = default_val; - } - return user_number; -} - static void ggml_check_sycl() try { static bool initialized = false; if (!initialized) { - g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); - g_ggml_sycl_disable_optimize = get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); - g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); - g_ggml_sycl_disable_dnn = get_sycl_env("GGML_SYCL_DISABLE_DNN", 0); - g_ggml_sycl_enable_vmm = get_sycl_env("GGML_SYCL_ENABLE_VMM", 1); - g_ggml_sycl_prioritize_dmmv = get_sycl_env("GGML_SYCL_PRIORITIZE_DMMV", 0); -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - g_ggml_sycl_enable_level_zero = get_sycl_env("GGML_SYCL_ENABLE_LEVEL_ZERO", ggml_sycl_info().ext_oneapi_level_zero); -#else - g_ggml_sycl_enable_level_zero = 0; -#endif + g_ggml_sycl_debug = ggml_sycl_get_env("GGML_SYCL_DEBUG", 0); + g_ggml_sycl_disable_optimize = ggml_sycl_get_env("GGML_SYCL_DISABLE_OPT", 0); + g_ggml_sycl_disable_graph = ggml_sycl_get_env("GGML_SYCL_DISABLE_GRAPH", 1); + g_ggml_sycl_disable_dnn = ggml_sycl_get_env("GGML_SYCL_DISABLE_DNN", 0); + g_ggml_sycl_enable_vmm = ggml_sycl_get_env("GGML_SYCL_ENABLE_VMM", 1); + g_ggml_sycl_prioritize_dmmv = ggml_sycl_get_env("GGML_SYCL_PRIORITIZE_DMMV", 0); + + g_ggml_sycl_dev2dev_memcpy = ggml_sycl_get_env("GGML_SYCL_DEV2DEV_MEMCPY", DEV2DEV_MEMCPY_SYCL); + if (g_ggml_sycl_use_level_zero_api == 0) { + g_ggml_sycl_dev2dev_memcpy = DEV2DEV_MEMCPY_SYCL; + } #ifdef SYCL_FLASH_ATTN - g_ggml_sycl_enable_flash_attention = get_sycl_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1); + g_ggml_sycl_enable_flash_attention = ggml_sycl_get_env("GGML_SYCL_ENABLE_FLASH_ATTN", 1); #else g_ggml_sycl_enable_flash_attention = 0; #endif + g_ggml_sycl_usm_system = ggml_sycl_get_env("GGML_SYCL_USM_SYSTEM", 0); + GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); GGML_LOG_INFO("Build with Macros:\n"); @@ -295,10 +312,10 @@ static void ggml_check_sycl() try { #else GGML_LOG_INFO(" GGML_SYCL_DNNL: no\n"); #endif -#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO) - GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: yes\n"); +#if defined(GGML_SYCL_SUPPORT_LEVEL_ZERO_API) + GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO_API: yes\n"); #else - GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO: no\n"); + GGML_LOG_INFO(" GGML_SYCL_SUPPORT_LEVEL_ZERO_API: no\n"); #endif #if defined(GGML_SYCL_USE_VMM) GGML_LOG_INFO(" GGML_SYCL_USE_VMM: yes\n"); @@ -314,10 +331,13 @@ static void ggml_check_sycl() try { #else GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: graph disabled by compile flag\n"); #endif -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: %d\n", g_ggml_sycl_enable_level_zero); +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API + GGML_LOG_INFO(" GGML_SYCL_USE_LEVEL_ZERO_API: %d\n", g_ggml_sycl_use_level_zero_api); + GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d\n", g_ggml_sycl_dev2dev_memcpy); #else - GGML_LOG_INFO(" GGML_SYCL_ENABLE_LEVEL_ZERO: Level Zero disabled by compile flag\n"); + GGML_LOG_INFO(" GGML_SYCL_USE_LEVEL_ZERO_API: Disable Level Zero API usage by compile flag\n"); + GGML_LOG_INFO(" GGML_SYCL_DEV2DEV_MEMCPY: %d, enable to SYCL API since missing GGML_SYCL_SUPPORT_LEVEL_ZERO_API\n", + g_ggml_sycl_dev2dev_memcpy); #endif #if GGML_SYCL_DNNL GGML_LOG_INFO(" GGML_SYCL_DISABLE_DNN: %d\n", g_ggml_sycl_disable_dnn); @@ -330,7 +350,7 @@ static void ggml_check_sycl() try { GGML_LOG_INFO(" GGML_SYCL_ENABLE_VMM: virtual memory extension is not available\n"); #endif GGML_LOG_INFO(" GGML_SYCL_PRIORITIZE_DMMV: %d\n", g_ggml_sycl_prioritize_dmmv); - g_ggml_sycl_use_async_mem_op_requested = get_sycl_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1); + g_ggml_sycl_use_async_mem_op_requested = ggml_sycl_get_env("GGML_SYCL_USE_ASYNC_MEM_OP", 1); GGML_LOG_INFO(" GGML_SYCL_USE_ASYNC_MEM_OP: %d\n", g_ggml_sycl_use_async_mem_op_requested); #ifdef SYCL_FLASH_ATTN @@ -340,6 +360,8 @@ static void ggml_check_sycl() try { g_ggml_sycl_enable_flash_attention); #endif + GGML_LOG_INFO(" GGML_SYCL_USM_SYSTEM: %d\n", g_ggml_sycl_usm_system); + /* NOT REMOVE, keep it for next optimize for XMX. #if defined(SYCL_USE_XMX) fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); @@ -415,6 +437,14 @@ catch (sycl::exception const &exc) { std::exit(1); } +inline void free_aligned_mem_host(void * memblock) { +#ifdef _WIN32 + _aligned_free(memblock); +#else + free(memblock); +#endif +} + // sycl buffer struct ggml_backend_sycl_buffer_context { @@ -424,9 +454,10 @@ struct ggml_backend_sycl_buffer_context { std::string name; optimize_feature opt_feature; std::vector tensor_extras; + bool is_usm_system; - ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream) : - device(device), dev_ptr(dev_ptr), stream(stream) { + ggml_backend_sycl_buffer_context(int device, void * dev_ptr, queue_ptr stream, bool is_usm_system) : + device(device), dev_ptr(dev_ptr), stream(stream), is_usm_system(is_usm_system) { check_allow_gpu_index(device); name = (GGML_SYCL_NAME + std::to_string(device)); opt_feature = ggml_sycl_info().devices[device].opt_feature; @@ -435,7 +466,10 @@ struct ggml_backend_sycl_buffer_context { ~ggml_backend_sycl_buffer_context() { if (dev_ptr != nullptr) { ggml_sycl_set_device(device); - SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream))); + if (is_usm_system) + free_aligned_mem_host(dev_ptr); + else + SYCL_CHECK(CHECK_TRY_ERROR(ggml_sycl_free_device(dev_ptr, *stream))); } //release extra used by tensors @@ -568,43 +602,50 @@ catch (sycl::exception const &exc) { std::exit(1); } -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO -static bool ggml_sycl_is_l0_discrete_gpu(sycl::queue &q) { - if (!q.get_device().is_gpu() || q.get_backend() != sycl::backend::ext_oneapi_level_zero) { - return false; - } - - ze_device_handle_t ze_dev = sycl::get_native(q.get_device()); - ze_device_properties_t props = {}; - props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES; - ze_result_t r = zeDeviceGetProperties(ze_dev, &props); - return r == ZE_RESULT_SUCCESS && !(props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED); +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API +static bool ggml_sycl_is_l0_discrete_gpu(int device) { + return ggml_sycl_info().devices[device].l0_discrete_gpu; } #endif -static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, +static void dev2dev_memcpy(int device_dst, sycl::queue &q_dst, int device_src, sycl::queue &q_src, void *ptr_dst, const void *ptr_src, size_t size) { -#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO - // Use Level Zero direct copy for dGPU-to-dGPU transfers. - const bool l0_copy_supported = - ggml_sycl_is_l0_discrete_gpu(q_dst) && ggml_sycl_is_l0_discrete_gpu(q_src); - if (g_ggml_sycl_enable_level_zero && l0_copy_supported) { - auto ze_ctx = sycl::get_native(q_dst.get_context()); - auto ze_dev = sycl::get_native(q_dst.get_device()); - ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0, - 0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; - ze_command_list_handle_t cl; - ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl); - if (r == ZE_RESULT_SUCCESS) { - r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr); - zeCommandListDestroy(cl); + +#ifdef GGML_SYCL_SUPPORT_LEVEL_ZERO_API + if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_L0) { + // Use Level Zero direct copy for dGPU-to-dGPU transfers. + const bool l0_copy_supported = + ggml_sycl_is_l0_discrete_gpu(device_dst) && ggml_sycl_is_l0_discrete_gpu(device_src); + if (g_ggml_sycl_use_level_zero_api && l0_copy_supported) { + auto ze_ctx = sycl::get_native(q_dst.get_context()); + auto ze_dev = sycl::get_native(q_dst.get_device()); + ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0, + 0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; + ze_command_list_handle_t cl; + ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl); if (r == ZE_RESULT_SUCCESS) { - return; + GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by L0\n"); + r = zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr); + zeCommandListDestroy(cl); + if (r == ZE_RESULT_SUCCESS) { + return; + } } } } #endif + + if (g_ggml_sycl_dev2dev_memcpy == DEV2DEV_MEMCPY_SYCL) { + if (q_dst.get_device().ext_oneapi_can_access_peer(q_src.get_device(), + sycl::ext::oneapi::peer_access::access_supported)) { + GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by SYCL\n"); + SYCL_CHECK(CHECK_TRY_ERROR(q_dst.memcpy(ptr_dst, ptr_src, size).wait())); + return; + } + } + // Host-staged copy + GGML_SYCL_DEBUG("[SYCL] dev2dev memcpy by host forward\n"); char *host_buf = (char *)malloc(size); q_src.memcpy(host_buf, (const char *)ptr_src, size).wait(); q_dst.memcpy((char *)ptr_dst, host_buf, size).wait(); @@ -651,7 +692,7 @@ ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, size_t size = ggml_nbytes(src); //todo. it's dirty solutino to walkaroud known issue:device2device cross GPUs. - dev2dev_memcpy(*stream_dst, *stream_src, dst->data, src->data, size); + dev2dev_memcpy(dst_ctx->device, *stream_dst, src_ctx->device, *stream_src, dst->data, src->data, size); //todo, it's known issue:error in device2device cross GPUs. reused when the issue is fixed. DON"T remove #if 0 @@ -765,21 +806,59 @@ static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_t return ctx->name.c_str(); } +static bool check_usm_system(int device, size_t size) { + bool use_usm_system = g_ggml_sycl_usm_system && size >= MEM_SIZE_1G; + + if (use_usm_system && !ggml_sycl_info().devices[device].usm_system_support) { + GGML_LOG_INFO("Device does not support USM system allocations\n"); + use_usm_system = false; + } + + return use_usm_system; +} + +inline void * aligned_malloc_host(size_t alignment, size_t size) { +#ifdef _WIN32 + return _aligned_malloc(size, alignment); +#else + return aligned_alloc(alignment, size); +#endif +} + static ggml_backend_buffer_t ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) try { + ggml_check_sycl(); + ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; ggml_sycl_set_device(buft_ctx->device); const queue_ptr stream = buft_ctx->stream; size = std::max(size, (size_t)1); // syclMalloc returns null for size 0 + /* + Alignment below ensures best performance. While in theory it could lead to + wasting memory, this is acceptable because in practice only few buffers are + allocated and even less exceed the minimum size accepted here for USM system + allocations. + */ + size_t alignment = MEM_SIZE_2M; + size_t aligned_size = ((size + alignment - 1) / alignment) * alignment; + bool use_usm_system = check_usm_system(buft_ctx->device, aligned_size); void * dev_ptr; - SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream))); - if (!dev_ptr) { - GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size); - return nullptr; + if (use_usm_system) { + dev_ptr = (void *)aligned_malloc_host(alignment, aligned_size); + if (!dev_ptr) { + GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on host\n", __func__, size); + return nullptr; + } + } else { + SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)ggml_sycl_malloc_device(size, *stream))); + if (!dev_ptr) { + GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size); + return nullptr; + } } - ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream); + ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr, buft_ctx->stream, use_usm_system); return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size); } catch (sycl::exception const &exc) { @@ -897,6 +976,7 @@ static int64_t get_row_rounding(ggml_type type, const std::array= VER_GEN9 ? 128 : 64; @@ -1306,22 +1386,6 @@ static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_ GGML_UNUSED(buft); } -inline void * aligned_malloc_host(size_t alignment, size_t size) { -#ifdef _WIN32 - return _aligned_malloc(size, alignment); -#else - return aligned_alloc(alignment, size); -#endif -} - -inline void free_aligned_mem_host(void * memblock) { -#ifdef _WIN32 - _aligned_free(memblock); -#else - free(memblock); -#endif -} - static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { free_aligned_mem_host((void *)buffer->context); } @@ -1947,69 +2011,6 @@ static void scale_f32(const float * x, float * dst, const float scale, const flo } -template -static void pool2d_nchw_kernel( - const int ih, const int iw, const int oh, const int ow, - const int kh, const int kw, const int sh, const int sw, - const int ph, const int pw, const int parallel_elements, - const Ti* src, To* dst, const enum ggml_op_pool op, - const sycl::nd_item<3> &item_ct1) { - int idx = item_ct1.get_local_id(2) + - item_ct1.get_group(2) * item_ct1.get_local_range(2); - if (idx >= parallel_elements) { - return; - } - - const int I_HW = ih * iw; - const int O_HW = oh * ow; - const int nc = idx / O_HW; - const int cur_oh = idx % O_HW / ow; - const int cur_ow = idx % O_HW % ow; - const Ti* i_ptr = src + nc * I_HW; - To* o_ptr = dst + nc * O_HW; - const int start_h = cur_oh * sh - ph; - const int bh = sycl::max(0, start_h); - const int eh = sycl::min(ih, start_h + kh); - const int start_w = cur_ow * sw - pw; - const int bw = sycl::max(0, start_w); - const int ew = sycl::min(iw, start_w + kw); - - To res = 0; - - switch (op) { - case GGML_OP_POOL_AVG: res = 0; break; - case GGML_OP_POOL_MAX: res = -FLT_MAX; break; - default: - res = (To) sycl::nan(uint32_t(0)); - break; - } - - for (int i = bh; i < eh; i += 1) { - for (int j = bw; j < ew; j += 1) { -#if DPCT_COMPATIBILITY_TEMP >= 350 - /* - DPCT1098:106: The '*' expression is used instead of the __ldg - call. These two expressions do not provide the exact same - functionality. Check the generated code for potential precision - and/or performance issues. - */ - Ti cur = *(i_ptr + i * iw + j); -#else - Ti cur = i_ptr[i * iw + j]; -#endif - switch (op) { - case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break; - case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break; - default: - res = (To) sycl::nan(uint32_t(0)); - break; - } - } - } - o_ptr[cur_oh * ow + cur_ow] = res; -} - - static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y, float *dst, const int ncols_x, const int nrows_x, @@ -2558,45 +2559,6 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - dpct::queue_ptr main_stream = ctx.stream(); - SYCL_CHECK(ggml_sycl_set_device(ctx.device)); - const float * src0_dd = static_cast(dst->src[0]->data); - float * dst_dd = static_cast(dst->data); - - const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = static_cast(opts[0]); - const int k0 = opts[1]; - const int k1 = opts[2]; - const int s0 = opts[3]; - const int s1 = opts[4]; - const int p0 = opts[5]; - const int p1 = opts[6]; - - const int64_t IH = dst->src[0]->ne[1]; - const int64_t IW = dst->src[0]->ne[0]; - - const int64_t N = dst->ne[3]; - const int64_t OC = dst->ne[2]; - const int64_t OH = dst->ne[1]; - const int64_t OW = dst->ne[0]; - - const int parallel_elements = N * OC * OH * OW; - const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE; - sycl::range<3> block_nums(1, 1, num_blocks); - main_stream->parallel_for( - sycl::nd_range<3>(block_nums * - sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE), - sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)), - [=](sycl::nd_item<3> item_ct1) { - pool2d_nchw_kernel(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, - parallel_elements, src0_dd, dst_dd, op, - item_ct1); - }); -} - inline void ggml_sycl_op_sum(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -3056,7 +3018,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten src1_ddf_i_source += (i0 * ne11 + src1_col_0) * ne10; SYCL_CHECK( - CHECK_TRY_ERROR(dev2dev_memcpy(*stream, *main_stream, src1_ddf_i, src1_ddf_i_source, + CHECK_TRY_ERROR(dev2dev_memcpy(i, *stream, ctx.device, *main_stream, src1_ddf_i, src1_ddf_i_source, src1_ncols * ne10 * sizeof(float)))); } } @@ -3546,6 +3508,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) { inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) { switch (type) { + case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: return true; @@ -3561,6 +3524,7 @@ inline bool ggml_sycl_supports_reorder_mul_mat_sycl(enum ggml_type type) { inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) { switch (type) { + case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: return true; @@ -3571,6 +3535,7 @@ inline bool ggml_sycl_supports_reorder_dmmv(enum ggml_type type) { inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) { switch (type) { + case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: case GGML_TYPE_Q3_K: @@ -3585,6 +3550,7 @@ inline bool ggml_sycl_supports_reorder_mmvq(enum ggml_type type) { static bool ggml_sycl_supports_dmmv(enum ggml_type type) { switch (type) { + case GGML_TYPE_Q1_0: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -3793,6 +3759,149 @@ static bool reorder_qw_q4_k(uint8_t * data_device, size_t size, size_t offset, d return true; } +// Reorder each expert slice into a self-contained SoA layout. +static bool reorder_qw_q4_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) { + GGML_ASSERT(expert_bytes % sizeof(block_q4_K) == 0); + const int blocks_per_expert = (int) (expert_bytes / sizeof(block_q4_K)); + const size_t total_bytes = expert_bytes * (size_t) n_expert; + + sycl_reorder_temp_buffer tmp(stream, total_bytes); + if (!tmp) { + GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes); + return false; + } + uint8_t * tmp_buf = static_cast(tmp.ptr); + + sycl::event copy_event; + SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes))); + if (!g_ggml_sycl_use_async_mem_op) { + copy_event.wait(); + } + + const int total_blocks = blocks_per_expert * (int) n_expert; + auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) { + const int gb = gb_; + const int e = gb / blocks_per_expert; + const int ib = gb % blocks_per_expert; + const block_q4_K * x = (const block_q4_K *) (tmp_buf + (size_t) e * expert_bytes); + uint8_t * base = data_device + (size_t) e * expert_bytes; + + auto * qs_ptr = base; + auto * scales_ptr = qs_ptr + QK_K / 2 * blocks_per_expert; + auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * blocks_per_expert); + + for (int j = 0; j < QK_K / 2; ++j) { + qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j]; + } + for (int j = 0; j < K_SCALE_SIZE; ++j) { + scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j]; + } + dm_ptr[ib] = x[ib].dm; + }); + if (!g_ggml_sycl_use_async_mem_op) { + reorder_event.wait_and_throw(); + } + return true; +} + +// Reorder each Q5_K expert slice into [qs][qh][scales][dm]. +static bool reorder_qw_q5_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) { + GGML_ASSERT(expert_bytes % sizeof(block_q5_K) == 0); + const int blocks_per_expert = (int) (expert_bytes / sizeof(block_q5_K)); + const size_t total_bytes = expert_bytes * (size_t) n_expert; + + sycl_reorder_temp_buffer tmp(stream, total_bytes); + if (!tmp) { + GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes); + return false; + } + uint8_t * tmp_buf = static_cast(tmp.ptr); + + sycl::event copy_event; + SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes))); + if (!g_ggml_sycl_use_async_mem_op) { + copy_event.wait(); + } + + const int total_blocks = blocks_per_expert * (int) n_expert; + auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) { + const int gb = gb_; + const int e = gb / blocks_per_expert; + const int ib = gb % blocks_per_expert; + const block_q5_K * x = (const block_q5_K *) (tmp_buf + (size_t) e * expert_bytes); + uint8_t * base = data_device + (size_t) e * expert_bytes; + + auto * qs_ptr = base; + auto * qh_ptr = qs_ptr + (QK_K / 2) * blocks_per_expert; + auto * scales_ptr = qh_ptr + (QK_K / 8) * blocks_per_expert; + auto * dm_ptr = (sycl::half2 *) (scales_ptr + K_SCALE_SIZE * blocks_per_expert); + + for (int j = 0; j < QK_K / 2; ++j) { + qs_ptr[ib * (QK_K / 2) + j] = x[ib].qs[j]; + } + for (int j = 0; j < QK_K / 8; ++j) { + qh_ptr[ib * (QK_K / 8) + j] = x[ib].qh[j]; + } + for (int j = 0; j < K_SCALE_SIZE; ++j) { + scales_ptr[ib * K_SCALE_SIZE + j] = x[ib].scales[j]; + } + dm_ptr[ib] = x[ib].dm; + }); + if (!g_ggml_sycl_use_async_mem_op) { + reorder_event.wait_and_throw(); + } + return true; +} + +// Reorder each Q6_K expert slice into [ql][qh][scales][d]. +static bool reorder_qw_q6_k_moe(uint8_t * data_device, size_t expert_bytes, int64_t n_expert, dpct::queue_ptr stream) { + GGML_ASSERT(expert_bytes % sizeof(block_q6_K) == 0); + const int blocks_per_expert = (int) (expert_bytes / sizeof(block_q6_K)); + const size_t total_bytes = expert_bytes * (size_t) n_expert; + + sycl_reorder_temp_buffer tmp(stream, total_bytes); + if (!tmp) { + GGML_LOG_WARN("%s: failed to allocate %zu bytes for reorder temp buffer, skipping reorder\n", __func__, total_bytes); + return false; + } + uint8_t * tmp_buf = static_cast(tmp.ptr); + + sycl::event copy_event; + SYCL_CHECK(CHECK_TRY_ERROR(copy_event = stream->memcpy(tmp_buf, data_device, total_bytes))); + if (!g_ggml_sycl_use_async_mem_op) { + copy_event.wait(); + } + + const int total_blocks = blocks_per_expert * (int) n_expert; + auto reorder_event = stream->parallel_for(total_blocks, [=](auto gb_) { + const int gb = gb_; + const int e = gb / blocks_per_expert; + const int ib = gb % blocks_per_expert; + const block_q6_K * x = (const block_q6_K *) (tmp_buf + (size_t) e * expert_bytes); + uint8_t * base = data_device + (size_t) e * expert_bytes; + + auto * ql_ptr = base; + auto * qh_ptr = ql_ptr + (QK_K / 2) * blocks_per_expert; + auto * scales_ptr = qh_ptr + (QK_K / 4) * blocks_per_expert; + auto * d_ptr = (sycl::half *) (scales_ptr + (QK_K / 16) * blocks_per_expert); + + for (int j = 0; j < QK_K / 2; ++j) { + ql_ptr[ib * (QK_K / 2) + j] = x[ib].ql[j]; + } + for (int j = 0; j < QK_K / 4; ++j) { + qh_ptr[ib * (QK_K / 4) + j] = x[ib].qh[j]; + } + for (int j = 0; j < QK_K / 16; ++j) { + scales_ptr[ib * (QK_K / 16) + j] = x[ib].scales[j]; + } + d_ptr[ib] = x[ib].d; + }); + if (!g_ggml_sycl_use_async_mem_op) { + reorder_event.wait_and_throw(); + } + return true; +} + static bool reorder_qw_q3_k(uint8_t * data_device, size_t size, size_t offset, dpct::queue_ptr stream) { GGML_ASSERT(size % sizeof(block_q3_K) == 0); GGML_ASSERT(offset % sizeof(block_q3_K) == 0); @@ -3948,6 +4057,22 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) { size_t nrows = src0->ne[1]; size_t size = ggml_nbytes(src0); + // MoE expert weights are addressed per expert via nb[2], so each slice must + // remain self-contained after reorder. + if (src0->ne[2] > 1) { + GGML_ASSERT((size_t) size == (size_t) src0->ne[2] * src0->nb[2]); + switch (src0->type) { + case GGML_TYPE_Q4_K: + return reorder_qw_q4_k_moe(data_device, src0->nb[2], src0->ne[2], stream); + case GGML_TYPE_Q5_K: + return reorder_qw_q5_k_moe(data_device, src0->nb[2], src0->ne[2], stream); + case GGML_TYPE_Q6_K: + return reorder_qw_q6_k_moe(data_device, src0->nb[2], src0->ne[2], stream); + default: + return false; + } + } + switch (src0->type) { case GGML_TYPE_Q4_0: return reorder_qw_q4_0(data_device, ncols, nrows, size, 0, stream); @@ -3962,7 +4087,6 @@ static bool reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) { case GGML_TYPE_Q6_K: return reorder_qw_q6_k(data_device, size, 0, stream); default: - GGML_ABORT("reorder_qw() called with unsupported type"); return false; } } @@ -4010,6 +4134,23 @@ static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * } } +// Lazily reorder supported MoE expert weights once their fused path is used. +static void opt_for_reorder_id(ggml_backend_sycl_context * ctx, const ggml_tensor * src0) { + if (g_ggml_sycl_disable_optimize || !ctx->opt_feature.reorder) { + return; + } + if (src0->type != GGML_TYPE_Q4_K && src0->type != GGML_TYPE_Q5_K && src0->type != GGML_TYPE_Q6_K) { + return; + } + ggml_tensor_extra_gpu * extra = static_cast(src0->extra); + if (!extra || extra->optimized_feature.reorder) { + return; + } + if (reorder_qw(src0, ctx->stream())) { + extra->optimized_feature.reorder = true; + } +} + static bool can_use_dequantize_mul_mat_vec(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { // The F16/BF16 qk=1 kernel iterates with stride 2*DMMV_X, requiring ne[0] to be @@ -4115,11 +4256,6 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor } -struct mmid_row_mapping { - int32_t i1; - int32_t i2; -}; - __dpct_inline__ static void k_copy_src1_to_contiguous( const char *__restrict__ src1_original, char *__restrict__ src1_contiguous, const mmid_row_mapping *__restrict__ row_mapping, @@ -4175,11 +4311,6 @@ static bool ggml_sycl_mul_mat_id_mmvq_fused( if (ne10 != src0->ne[0] || ne10 % QK8_1 != 0) return false; if (!ggml_is_contiguous(src1)) return false; - // Reorder layout not supported; fall back. - const ggml_tensor_extra_gpu * src0_extra = - static_cast(src0->extra); - if (src0_extra && src0_extra->optimized_feature.reorder) return false; - const int64_t n_ids_per_group = ids->ne[0]; if (ids->ne[1] != 1) return false; if (ne11 != 1 && ne11 != n_ids_per_group) return false; @@ -4189,16 +4320,37 @@ static bool ggml_sycl_mul_mat_id_mmvq_fused( const int n_experts_used = (int) n_ids_per_group; const int nrows = (int) src0->ne[1]; + // Lazily reorder the (Q4_K) expert weights into a per-expert SoA layout, then run the reorder + // GEMV. Placed after the bail checks so a non-dispatchable op does not pay the reorder cost. + opt_for_reorder_id(&ctx, src0); + const ggml_tensor_extra_gpu * src0_extra = + static_cast(src0->extra); + const bool use_reorder = src0_extra && src0_extra->optimized_feature.reorder; + ggml_sycl_pool_alloc src1_q8_alloc(ctx.pool(), (size_t) ne11 * src1_padded_cols * sizeof(block_q8_1) / QK8_1); char * src1_ddq = src1_q8_alloc.get(); - quantize_row_q8_1_sycl( - (const float *) src1->data, src1_ddq, (int) ne10, (int) ne11, - src1_padded_cols, stream); + if (use_reorder) { + quantize_row_q8_1_sycl( + (const float *) src1->data, src1_ddq, (int) ne10, (int) ne11, + src1_padded_cols, stream); + } else { + quantize_row_q8_1_sycl( + (const float *) src1->data, src1_ddq, (int) ne10, (int) ne11, + src1_padded_cols, stream); + } const size_t bytes_per_qrow = (size_t) src1_padded_cols * sizeof(block_q8_1) / QK8_1; const size_t src1_row_stride = (ne11 == 1) ? 0 : bytes_per_qrow; + if (use_reorder) { + return ggml_sycl_mul_mat_vec_q_id_reorder( + src0->type, src0->data, src1_ddq, (const int32_t *) ids->data, + (float *) dst->data, (int) ne10, nrows, n_experts_used, + /*expert_weight_stride=*/ src0->nb[2], + /*dst_row_stride=*/ dst->nb[1], + src1_row_stride, stream); + } return ggml_sycl_mul_mat_vec_q_id( src0->type, src0->data, src1_ddq, (const int32_t *) ids->data, (float *) dst->data, (int) ne10, nrows, n_experts_used, @@ -4274,6 +4426,8 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, SYCL_CHECK(CHECK_TRY_ERROR( stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)))); + + // also ensures ctx.mmid_row_mapping_host is drained before we use it again SYCL_CHECK(CHECK_TRY_ERROR(stream->wait())); ggml_tensor src0_row = *src0; @@ -4331,7 +4485,7 @@ static void ggml_sycl_mul_mat_id(ggml_backend_sycl_context & ctx, // where each expert's slice starts and the previous ends (row indices, right-exclusive) std::vector expert_row_offsets; // the sources (slot/token pairs) of contiguous rows to guide k_copy_src1_to_contiguous - std::vector routed_row_src; + std::vector & routed_row_src = ctx.mmid_row_mapping_host; mmid_counting_sort_rows(ids, ids_host.data(), n_ids, n_as, n_routed_rows, expert_row_counts, expert_row_offsets, routed_row_src); @@ -4435,6 +4589,11 @@ static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) ggml_sycl_op_pool2d(ctx, dst); } +static void ggml_sycl_pool1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_pool1d(ctx, dst); +} + static void ggml_sycl_im2col(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); ggml_sycl_op_im2col(ctx, dst); @@ -4445,6 +4604,11 @@ static void ggml_sycl_im2col_3d(ggml_backend_sycl_context & ctx, ggml_tensor * d ggml_sycl_op_im2col_3d(ctx, dst); } +static void ggml_sycl_conv_3d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); + ggml_sycl_op_conv_3d(ctx, dst); +} + static void ggml_sycl_sum(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); GGML_ASSERT(ggml_is_contiguous(dst->src[0])); @@ -4508,9 +4672,21 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_OP_ARGMAX: ggml_sycl_argmax(ctx, dst); break; + case GGML_OP_CONV_2D: + ggml_sycl_op_conv2d(ctx, dst); + break; + case GGML_OP_CONV_2D_DW: + ggml_sycl_op_conv2d_dw(ctx, dst); + break; + case GGML_OP_CONV_3D: + ggml_sycl_conv_3d(ctx, dst); + break; case GGML_OP_CONV_TRANSPOSE_1D: ggml_sycl_op_conv_transpose_1d(ctx, dst); break; + case GGML_OP_CONV_TRANSPOSE_2D: + ggml_sycl_op_conv2d_transpose(ctx, dst); + break; case GGML_OP_REPEAT: ggml_sycl_repeat(ctx, dst); break; @@ -4592,6 +4768,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_UNARY_OP_EXP: ggml_sycl_exp(ctx, dst); break; + case GGML_UNARY_OP_EXPM1: + ggml_sycl_expm1(ctx, dst); + break; case GGML_UNARY_OP_SOFTPLUS: ggml_sycl_softplus(ctx, dst); break; @@ -4748,6 +4927,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_OP_POOL_2D: ggml_sycl_pool2d(ctx, dst); break; + case GGML_OP_POOL_1D: + ggml_sycl_pool1d(ctx, dst); + break; case GGML_OP_SUM: ggml_sycl_sum(ctx, dst); break; @@ -5208,7 +5390,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_device_buffer_from_host_ptr(ggml_ return nullptr; } -static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { +static bool do_ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { ggml_backend_sycl_device_context *sycl_ctx = (ggml_backend_sycl_device_context *)dev->context; int device = sycl_ctx->device; @@ -5222,6 +5404,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g } return false; } + case GGML_OP_CONV_2D: + case GGML_OP_CONV_2D_DW: + case GGML_OP_CONV_TRANSPOSE_2D: + return true; case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_SGN: @@ -5238,6 +5424,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_GELU_ERF: case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_EXPM1: case GGML_UNARY_OP_SOFTPLUS: case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_CEIL: @@ -5245,11 +5432,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_FLOOR: case GGML_UNARY_OP_ROUND: case GGML_UNARY_OP_TRUNC: -#if defined (GGML_SYCL_F16) - return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type); -#else - return ggml_is_contiguous(op->src[0]) && (op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32) && (op->type == op->src[0]->type); -#endif + return true; default: return false; } @@ -5272,19 +5455,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g struct ggml_tensor * a = op->src[0]; struct ggml_tensor * b = op->src[1]; - // disable Q1_0 until implementation - if (a->type == GGML_TYPE_Q1_0 || b->type == GGML_TYPE_Q1_0) { - return false; - } - if (a->ne[3] != b->ne[3]) { return false; } ggml_type src0_type = op->src[0]->type; - - // TODO: The configuration below needs more work to be supported with oneDNN if (ggml_is_permuted(a) && !ggml_is_contiguous(a) && a->ne[2] > 1 && a->ne[3] > 1 && src0_type == GGML_TYPE_F16) { @@ -5294,12 +5470,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g // TODO: This specific configuration can fail with oneDNN and needs more debugging if (!ggml_is_permuted(a) && ggml_is_permuted(b) && b->ne[2] > 1 && b->ne[3] > 1 && a->ne[0] > 128 && a->ne[2] == 1 && src0_type == GGML_TYPE_F16) { + printf("zjy 2\n"); return false; } return true; } case GGML_OP_OUT_PROD: - return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1; + return op->type == GGML_TYPE_F32 && + (op->src[0]->type == GGML_TYPE_F32 || + (op->src[0]->type == GGML_TYPE_Q1_0 && op->src[0]->ne[2] == op->src[1]->ne[2] && + op->src[0]->ne[3] == op->src[1]->ne[3])) && + op->src[1]->type == GGML_TYPE_F32; case GGML_OP_GET_ROWS: { switch (op->src[0]->type) { @@ -5342,10 +5523,15 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_SET_ROWS: { - return ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 || + + auto res = ((op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16 || op->type == GGML_TYPE_BF16 || op->type == GGML_TYPE_Q8_0 || op->type == GGML_TYPE_Q5_1 || op->type == GGML_TYPE_Q5_0 || - op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_IQ4_NL) && + op->type == GGML_TYPE_Q1_0 || + op->type == GGML_TYPE_Q4_1 || op->type == GGML_TYPE_Q4_0 || op->type == GGML_TYPE_IQ4_NL || + op->type == GGML_TYPE_MXFP4 || op->type == GGML_TYPE_NVFP4) && + op->src[0]->type == GGML_TYPE_F32 && (op->src[1]->type == GGML_TYPE_I64 || op->src[1]->type == GGML_TYPE_I32)); + return res; } break; case GGML_OP_CPY: @@ -5447,11 +5633,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_COS: case GGML_OP_CLAMP: case GGML_OP_LOG: -#if defined (GGML_SYCL_F16) - return ((op->type == GGML_TYPE_F32 || op->type == GGML_SYCL_F16) && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_SYCL_F16) && (op->type == op->src[0]->type)); -#else - return (op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32) && (op->type == op->src[0]->type); -#endif case GGML_OP_NORM: case GGML_OP_L2_NORM: case GGML_OP_GROUP_NORM: @@ -5485,6 +5666,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_IM2COL_3D: case GGML_OP_UPSCALE: return true; + case GGML_OP_CONV_3D: + return op->type == GGML_TYPE_F32 && + (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + op->src[1]->type == GGML_TYPE_F32 && + ggml_is_contiguous(op->src[0]) && + ggml_is_contiguous(op->src[1]); case GGML_OP_SUM: case GGML_OP_SUM_ROWS: case GGML_OP_MEAN: @@ -5502,6 +5689,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g k > 0 && k <= 32; } case GGML_OP_POOL_2D: + case GGML_OP_POOL_1D: return true; case GGML_OP_ACC: return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]); @@ -5549,6 +5737,13 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g GGML_UNUSED(dev); } +static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) { + bool res = do_ggml_backend_sycl_device_supports_op(dev, op); + GGML_SYCL_DEBUG("[SYCL] call %s op->op=%s op->type=%s -> %s\n", __func__, ggml_op_name(op->op), + ggml_type_name(op->type), res ? "true" : "false"); + return res; +} + static bool ggml_backend_sycl_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { if (buft->iface.get_name != ggml_backend_sycl_buffer_type_get_name) { return false; @@ -5664,6 +5859,250 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re return ctx->devices[index]; } +// ========================================================================== +// Tensor parallelism (--split-mode tensor) for the SYCL backend. +// +// The meta-backend invokes these three entry points via get_proc_address: +// * ggml_backend_sycl_comm_init - one-time per-graph setup +// * ggml_backend_sycl_comm_allreduce_tensor - per-allreduce step +// * ggml_backend_sycl_comm_free - tear-down +// +// For N=2 (dual-GPU), this is a degenerate ring allreduce with dual paths +// chosen by tensor size: +// +// * Small (nelem < 32K): FP32 direct memcpy + per-device ADD +// kernel. The kernel depends_on() its corresponding memcpy event +// so it doesn't read partial data. Both devices run in parallel. +// +// * Large (nelem >= 32K): BF16-compressed. Each device compresses +// its FP32 partial to BF16 locally, cross-device memcpys +// to the peer (half the PCI bandwidth), where it is decompressed +// and added into the local FP32 partial. 6 SYCL submissions per +// allreduce (2 compress + 2 memcpy + 2 decompress-add) vs the +// 4 for the small path, but the bandwidth saving > 6 GB/s PCIe x 2 +// dominates for larger tensors. +// +// Storage: A persistent uint8_t buffer per device, sized to +// 4 * nelem bytes. Both paths reinterpret the same bytes (small path +// as nelem floats; large path as outbox + inbox = 2*nelem uint16_t +// each, using the full 4*nelem byte budget either way). Single +// alloc+free per device keeps the SYCL pool's strict-LIFO invariant +// trivial. +// +// For non-(N=2 FP32 contiguous) cases, comm_init or comm_allreduce_tensor +// returns null/false, causing the meta-backend to use its generic +// butterfly all-reduce fallback. +// ========================================================================== + +struct ggml_backend_sycl_comm_context { + std::vector backends; + // ONE persistent per-device byte buffer, 4*nelem bytes. Both the + // FP32 small-tensor path and the BF16 large-tensor path share it + // by reinterpreting. + std::unique_ptr> buf0; + std::unique_ptr> buf1; + int64_t buf_nelem = 0; +}; + +void * ggml_backend_sycl_comm_init(ggml_backend_t * backends, size_t n_backends) try { + for (size_t i = 0; i < n_backends; ++i) { + if (!ggml_backend_is_sycl(backends[i])) { + return nullptr; + } + } + + // Initial version: N=2 only. For N!=2, returning null makes the + // meta-backend skip this backend-specific allreduce entirely. + if (n_backends != 2) { + return nullptr; + } + + auto * ctx = new ggml_backend_sycl_comm_context; + ctx->backends.assign(backends, backends + n_backends); + auto * sctx0 = (ggml_backend_sycl_context *) backends[0]->context; + auto * sctx1 = (ggml_backend_sycl_context *) backends[1]->context; + ctx->buf0 = std::make_unique>(sctx0->pool()); + ctx->buf1 = std::make_unique>(sctx1->pool()); + return ctx; +} +catch (const sycl::exception &) { return nullptr; } +catch (...) { return nullptr; } + +void ggml_backend_sycl_comm_free(void * comm_ctx_v) { + auto * comm_ctx = static_cast(comm_ctx_v); + if (comm_ctx == nullptr) { + return; + } + + // Sync both per-device queues so the pool_alloc destructors don't + // return memory still in use by the last kernel. + if (comm_ctx->backends.size() == 2) { + auto * sctx0 = (ggml_backend_sycl_context *) comm_ctx->backends[0]->context; + auto * sctx1 = (ggml_backend_sycl_context *) comm_ctx->backends[1]->context; + try { + sctx0->stream()->wait(); + sctx1->stream()->wait(); + } catch (...) { /* best effort during shutdown */ } + } + + delete comm_ctx; +} + +bool ggml_backend_sycl_comm_allreduce_tensor(void * comm_ctx_v, struct ggml_tensor ** tensors) try { + if (comm_ctx_v == nullptr) { + return false; + } + + auto * comm_ctx = static_cast(comm_ctx_v); + const size_t n_backends = comm_ctx->backends.size(); + + // Fast path: N=2, F32/F16, contiguous, matching shapes. + if (n_backends != 2) { + return false; + } + // Accept F32 or F16 inputs natively (types must match). F16 takes the + // direct 2-byte memcpy + add path below; other types return false so the + // meta-backend uses its generic all-reduce. + if (tensors[0]->type != tensors[1]->type) { + return false; + } + if (tensors[0]->type != GGML_TYPE_F32 && tensors[0]->type != GGML_TYPE_F16) { + return false; + } + if (!ggml_is_contiguous(tensors[0]) || !ggml_is_contiguous(tensors[1])) { + return false; + } + if (ggml_nelements(tensors[0]) != ggml_nelements(tensors[1])) { + return false; + } + + const int64_t nelem = ggml_nelements(tensors[0]); + const size_t nbytes = ggml_nbytes(tensors[0]); + if (nelem == 0) { + return true; + } + + auto * ctx0 = (ggml_backend_sycl_context *) comm_ctx->backends[0]->context; + auto * ctx1 = (ggml_backend_sycl_context *) comm_ctx->backends[1]->context; + queue_ptr q0 = ctx0->stream(); + queue_ptr q1 = ctx1->stream(); + + // Grow per-device byte buffers if needed (4 * nelem bytes each). + if (comm_ctx->buf_nelem < nelem) { + comm_ctx->buf0->realloc(nelem * 4); + comm_ctx->buf1->realloc(nelem * 4); + comm_ctx->buf_nelem = nelem; + } + uint8_t * buf0 = comm_ctx->buf0->get(); + uint8_t * buf1 = comm_ctx->buf1->get(); + + // F16 native path: direct 2-byte cross-device copy + add, skipping the + // F32 round-trip the meta-backend fallback would force. Cross-device copies + // go through dev2dev_memcpy because the two devices are in separate SYCL + // contexts (a raw peer-USM q->memcpy would be a silent no-op). + if (tensors[0]->type == GGML_TYPE_F16) { + sycl::half * f16_out0 = (sycl::half *) tensors[0]->data; + sycl::half * f16_out1 = (sycl::half *) tensors[1]->data; + sycl::half * f16_tmp0 = (sycl::half *) buf0; + sycl::half * f16_tmp1 = (sycl::half *) buf1; + + q0->wait(); + q1->wait(); + dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, f16_tmp0, tensors[1]->data, nbytes); + dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, f16_tmp1, tensors[0]->data, nbytes); + + q0->submit([&](sycl::handler & h) { + h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) { + f16_out0[i] = (sycl::half) ((float) f16_out0[i] + (float) f16_tmp0[i]); + }); + }); + q1->submit([&](sycl::handler & h) { + h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) { + f16_out1[i] = (sycl::half) ((float) f16_out1[i] + (float) f16_tmp1[i]); + }); + }); + return true; + } + + float * out0 = (float *) tensors[0]->data; + float * out1 = (float *) tensors[1]->data; + + // BF16 threshold: above this, the PCIe savings from halving the + // cross-device bytes outweigh the 2 extra compress kernels. + // Below: stay on the FP32 fast path. Threshold mirrors the CUDA + // NCCL allreduce pattern for n_backends=2. + static constexpr int64_t BF16_THRESHOLD = 32768; + + if (nelem < BF16_THRESHOLD) { + // FP32 small path: 4 SYCL submissions per allreduce. + float * tmp0 = (float *) buf0; + float * tmp1 = (float *) buf1; + + // COMM-D2D-FIX: the two devices are in SEPARATE SYCL contexts, so a raw + // q->memcpy of a peer USM pointer is a silent no-op. Route cross-device + // copies through dev2dev_memcpy (L0 direct copy / host staging). It is + // synchronous, so wait for the local partials to be produced first. + q0->wait(); + q1->wait(); + dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, tmp0, tensors[1]->data, nbytes); + dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, tmp1, tensors[0]->data, nbytes); + + q0->submit([&](sycl::handler & h) { + h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) { + out0[i] += tmp0[i]; + }); + }); + q1->submit([&](sycl::handler & h) { + h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) { + out1[i] += tmp1[i]; + }); + }); + return true; + } + + // BF16 large path: 6 SYCL submissions per allreduce, but the + // cross-device memcpy is HALF the bytes. Pure bit-shift + // conversion (no rounding) — matches ggml's truncating fp32->bf16. + uint16_t * outbox0 = (uint16_t *) buf0; + uint16_t * inbox0 = outbox0 + nelem; + uint16_t * outbox1 = (uint16_t *) buf1; + uint16_t * inbox1 = outbox1 + nelem; + + // Phase A: compress each device's local partial in parallel. + sycl::event c0 = q0->parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) { + outbox0[i] = (uint16_t) (sycl::bit_cast(out0[i]) >> 16); + }); + + sycl::event c1 = q1->parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) { + outbox1[i] = (uint16_t) (sycl::bit_cast(out1[i]) >> 16); + }); + + // Phase B: COMM-D2D-FIX-BF16 cross-device copy of compressed bytes via + // dev2dev_memcpy (separate SYCL contexts; sync copy after compress). + const size_t bf16_bytes = nelem * sizeof(uint16_t); + c0.wait(); + c1.wait(); + dev2dev_memcpy(ctx0->device, *q0, ctx1->device, *q1, inbox0, outbox1, bf16_bytes); + dev2dev_memcpy(ctx1->device, *q1, ctx0->device, *q0, inbox1, outbox0, bf16_bytes); + + // Phase C: decompress + add into local FP32 partial. + q0->submit([&](sycl::handler & h) { + h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) { + out0[i] += sycl::bit_cast(((uint32_t) inbox0[i]) << 16); + }); + }); + + q1->submit([&](sycl::handler & h) { + h.parallel_for(sycl::range<1>(nelem), [=](sycl::id<1> i) { + out1[i] += sycl::bit_cast(((uint32_t) inbox1[i]) << 16); + }); + }); + + return true; +} +catch (const sycl::exception &) { return false; } +catch (...) { return false; } + static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) { GGML_UNUSED(reg); @@ -5671,6 +6110,17 @@ static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, cons return (void *)ggml_backend_sycl_split_buffer_type; } + // Tensor parallelism (--split-mode tensor) entry points. + if (strcmp(name, "ggml_backend_comm_init") == 0) { + return (void *)ggml_backend_sycl_comm_init; + } + if (strcmp(name, "ggml_backend_comm_free") == 0) { + return (void *)ggml_backend_sycl_comm_free; + } + if (strcmp(name, "ggml_backend_comm_allreduce_tensor") == 0) { + return (void *)ggml_backend_sycl_comm_allreduce_tensor; + } + // SYCL doesn't support registering host memory, left here for reference // "ggml_backend_register_host_buffer" // "ggml_backend_unregister_host_buffer" diff --git a/ggml/src/ggml-sycl/mmvq.cpp b/ggml/src/ggml-sycl/mmvq.cpp index cf2b59576aa1..7b1b3d467fa2 100644 --- a/ggml/src/ggml-sycl/mmvq.cpp +++ b/ggml/src/ggml-sycl/mmvq.cpp @@ -662,13 +662,12 @@ static void reorder_mul_mat_vec_q4_0_q8_1_sycl(const void * vx, const void * vy, GGML_ASSERT(ncols % QK4_0 == 0); // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. constexpr size_t num_subgroups = WARP_SIZE; - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; - - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE)); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, nd_item); @@ -683,13 +682,13 @@ static void reorder_mul_mat_vec_q4_0_q8_1_sycl_ncols( const int stride_col_y_bytes, const int stride_col_dst, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK4_0 == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder_ncols, ncols_dst>( vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); @@ -1080,13 +1079,12 @@ static void reorder_mul_mat_vec_q8_0_q8_1_sycl(const void * vx, const void * vy, GGML_ASSERT(ncols % QK8_0 == 0); // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. constexpr size_t num_subgroups = WARP_SIZE; - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; - - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, (block_num_y * WARP_SIZE)); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, nd_item); @@ -1101,13 +1099,13 @@ static void reorder_mul_mat_vec_q8_0_q8_1_sycl_ncols( const int stride_col_y_bytes, const int stride_col_dst, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK8_0 == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder_ncols, ncols_dst>( vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); @@ -1196,6 +1194,66 @@ static void mul_mat_vec_q8_0_q8_1_sycl_switch_ncols( } } +static void mul_mat_vec_q1_0_q8_1_sycl(const void * vx, const void * vy, + float * dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK1_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); +} + +template +static void mul_mat_vec_q1_0_q8_1_sycl_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK1_0 == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_ncols( + vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, item_ct1); + }); + }); +} + +static void mul_mat_vec_q1_0_q8_1_sycl_switch_ncols( + const void * vx, const void * vy, float * dst, + const int ncols, const int nrows, const int ncols_dst, + const int stride_col_y, const int stride_col_dst, + dpct::queue_ptr stream) { + switch (ncols_dst) { + case 1: mul_mat_vec_q1_0_q8_1_sycl(vx, vy, dst, ncols, nrows, stream); break; + case 2: mul_mat_vec_q1_0_q8_1_sycl_ncols<2>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 3: mul_mat_vec_q1_0_q8_1_sycl_ncols<3>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 4: mul_mat_vec_q1_0_q8_1_sycl_ncols<4>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 5: mul_mat_vec_q1_0_q8_1_sycl_ncols<5>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 6: mul_mat_vec_q1_0_q8_1_sycl_ncols<6>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 7: mul_mat_vec_q1_0_q8_1_sycl_ncols<7>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + case 8: mul_mat_vec_q1_0_q8_1_sycl_ncols<8>(vx, vy, dst, ncols, nrows, stride_col_y, stride_col_dst, stream); break; + default: GGML_ABORT("unsupported ncols_dst=%d for Q1_0 multi-col MMVQ", ncols_dst); + } +} + static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -1289,13 +1347,12 @@ static void reorder_mul_mat_vec_q3_k_q8_1_sycl(const void * vx, const void * vy, // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. constexpr size_t num_subgroups = WARP_SIZE; - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; - - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, nd_item); @@ -1310,13 +1367,13 @@ static void reorder_mul_mat_vec_q3_k_q8_1_sycl_ncols( const int stride_col_y_bytes, const int stride_col_dst, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder_ncols, ncols_dst>( vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); @@ -1457,13 +1514,12 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl(const void * vx, const void * vy, // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. constexpr size_t num_subgroups = WARP_SIZE; - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; - - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, nd_item); @@ -1478,13 +1534,14 @@ static void reorder_mul_mat_vec_q4_k_q8_1_sycl_ncols( const int stride_col_y_bytes, const int stride_col_dst, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder_ncols, ncols_dst>( vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); @@ -1583,15 +1640,13 @@ static void reorder_mul_mat_vec_q5_k_q8_1_sycl(const void * vx, const void * vy, const int nrows, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); - - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, nd_item); @@ -1606,13 +1661,14 @@ static void reorder_mul_mat_vec_q5_k_q8_1_sycl_ncols( const int stride_col_y_bytes, const int stride_col_dst, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder_ncols, ncols_dst>( vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); @@ -1643,13 +1699,13 @@ static void reorder_mul_mat_vec_q6_k_q8_1_sycl(const void * vx, const void * vy, GGML_ASSERT(ncols % QK_K == 0); // Round up to a whole number of subgroup-sized workgroups; out-of-range rows are skipped inside the kernel. constexpr size_t num_subgroups = WARP_SIZE; - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups) * (int) num_subgroups; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder>(vx, vy, dst, ncols, nrows, nd_item); @@ -1664,13 +1720,13 @@ static void reorder_mul_mat_vec_q6_k_q8_1_sycl_ncols( const int stride_col_y_bytes, const int stride_col_dst, dpct::queue_ptr stream) { GGML_ASSERT(ncols % QK_K == 0); - const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y); - constexpr size_t num_subgroups = 16; - GGML_ASSERT(block_num_y % num_subgroups == 0); - const sycl::range<3> global_size(1, GGML_SYCL_MMV_Y, block_num_y * WARP_SIZE); - const sycl::range<3> workgroup_size(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + constexpr size_t num_subgroups = WARP_SIZE; + const int block_num_y = ceil_div(nrows, GGML_SYCL_MMV_Y * (int) num_subgroups); + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, num_subgroups * WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { - cgh.parallel_for(sycl::nd_range<3>(global_size, workgroup_size), + cgh.parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> nd_item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { mul_mat_vec_q_reorder_ncols, ncols_dst>( vx, vy, dst, ncols, nrows, stride_col_y_bytes, stride_col_dst, nd_item); @@ -2124,6 +2180,20 @@ void ggml_sycl_op_mul_mat_vec_q(ggml_backend_sycl_context & ctx, const ggml_tens mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); } break; + case GGML_TYPE_Q1_0: + if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { + const int stride_col_y = src1_padded_col_size / QK8_1; + const int stride_col_dst = dst->ne[0]; + GGML_SYCL_DEBUG("Calling mul_mat_vec_q1_0_q8_1_sycl_switch_ncols ncols=%d\n", (int)src1_ncols); + mul_mat_vec_q1_0_q8_1_sycl_switch_ncols( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, + src1_ncols, stride_col_y, stride_col_dst, stream); + return; + } else if (i == 0 || src1_ncols == 1) { + GGML_SYCL_DEBUG("Calling mul_mat_vec_q1_0_q8_1_sycl\n"); + mul_mat_vec_q1_0_q8_1_sycl(src0_dd_i, src1_ddq_i_bs, dst_dd_i_bs, ne00, row_diff, stream); + } + break; case GGML_TYPE_Q2_K: if (i == 0 && src1_ncols > 1 && src1_ncols <= 8) { const int stride_col_y = src1_padded_col_size / QK8_1; @@ -2472,3 +2542,118 @@ bool ggml_sycl_mul_mat_vec_q_id( return false; } } + +// Reorder (SoA) MoE expert GEMV: MoE expert/row/lane indexing (from mul_mat_vec_q_moe) with the +// dense-reorder per-block reads (from mul_mat_vec_q_reorder). Each expert slice in vx_base is a +// self-contained SoA, so nblocks = nrows*(ncols/qk) per expert and the constant expert stride holds. +template +static void mul_mat_vec_q_moe_reorder( + const void * __restrict__ vx_base, const void * __restrict__ vy_base, + float * __restrict__ dst_base, const int32_t * __restrict__ ids_dev, + const int ncols, const int nrows, + const size_t expert_weight_stride, const size_t dst_row_stride, + const size_t src1_row_stride, + const sycl::nd_item<3> & item_ct1) { + using block_type = ggml_sycl_reordered::block_q_t; + using block_traits = typename block_type::traits; + + const int expert_idx = item_ct1.get_group(1); + const int i02 = ids_dev[expert_idx]; + + const char * vx = (const char *) vx_base + (size_t) i02 * expert_weight_stride; + const char * vy = (const char *) vy_base + (size_t) expert_idx * src1_row_stride; + float * dst = (float *) ((char *) dst_base + (size_t) expert_idx * dst_row_stride); + + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); + if (row >= nrows) { + return; + } + + const auto sg = item_ct1.get_sub_group(); + + const int blocks_per_row = ncols / block_traits::qk; + constexpr int blocks_per_subgroup = ceil_div(block_traits::vdr_mmvq * WARP_SIZE, block_traits::qi); + constexpr int block_elements_per_subgroup = block_traits::qi / block_traits::vdr_mmvq; + const int nblocks = nrows * (ncols / block_traits::qk); + + static_assert(blocks_per_subgroup > 0); + static_assert(block_elements_per_subgroup > 0); + + float partial_sum = 0.0f; + for (int i = sg.get_local_linear_id() / block_elements_per_subgroup; i < blocks_per_row; i += blocks_per_subgroup) { + const int ibx = row * blocks_per_row + i; + + const auto bx_offset = block_type::get_block_offset(ibx, nblocks); + const auto d_offset = block_type::get_d_offset(nrows, ncols, ibx); + + const int iby = i * block_type::block_to_q8_1_ratio(); + const int8_t * q8_1_quant_ptr = (const int8_t *) vy + iby * QK8_1; + const sycl::half2 * q8_1_ds_ptr = (const sycl::half2 *) ((const char *) vy + ncols + iby * sizeof(sycl::half2)); + +#pragma unroll + for (int elem = 0; elem < block_elements_per_subgroup; elem += WARP_SIZE) { + const int iqs = elem + block_traits::vdr_mmvq * (sg.get_local_linear_id() % block_elements_per_subgroup); + partial_sum += reorder_vec_dot_q_sycl()(vx, bx_offset, d_offset, q8_1_quant_ptr, q8_1_ds_ptr, iqs); + } + } + + auto sum = sycl::reduce_over_group(sg, partial_sum, std::plus<>()); + if (sg.leader()) { + dst[row] = sum; + } +} + +template +static void launch_mul_mat_vec_q_moe_reorder( + const void * vx_base, const void * vy, const int32_t * ids_dev, + float * dst_base, const int ncols, const int nrows, const int n_experts_used, + const size_t expert_weight_stride, const size_t dst_row_stride, + const size_t src1_row_stride, + dpct::queue_ptr stream) { + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, (unsigned) n_experts_used, (unsigned) block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + stream->submit([&](sycl::handler & cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + mul_mat_vec_q_moe_reorder( + vx_base, vy, dst_base, ids_dev, ncols, nrows, + expert_weight_stride, dst_row_stride, src1_row_stride, item); + }); + }); +} + +bool ggml_sycl_mul_mat_vec_q_id_reorder( + enum ggml_type src0_type, + const void * vx_base, + const void * vy, + const int32_t * ids_dev, + float * dst_base, + int ncols, + int nrows, + int n_experts_used, + size_t expert_weight_stride, + size_t dst_row_stride, + size_t src1_row_stride, + dpct::queue_ptr stream) { + switch (src0_type) { + case GGML_TYPE_Q4_K: + launch_mul_mat_vec_q_moe_reorder>( + vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + case GGML_TYPE_Q5_K: + launch_mul_mat_vec_q_moe_reorder>( + vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + case GGML_TYPE_Q6_K: + launch_mul_mat_vec_q_moe_reorder>( + vx_base, vy, ids_dev, dst_base, ncols, nrows, n_experts_used, + expert_weight_stride, dst_row_stride, src1_row_stride, stream); + return true; + default: + return false; + } +} diff --git a/ggml/src/ggml-sycl/mmvq.hpp b/ggml/src/ggml-sycl/mmvq.hpp index d674dc1d61ea..c5d70bd0e2f8 100644 --- a/ggml/src/ggml-sycl/mmvq.hpp +++ b/ggml/src/ggml-sycl/mmvq.hpp @@ -40,4 +40,21 @@ bool ggml_sycl_mul_mat_vec_q_id( size_t src1_row_stride, // 0 = shared src1, else per-expert stride in bytes dpct::queue_ptr stream); +// Reorder (SoA) variant of the fused MoE expert GEMV. +// vx_base: each expert slice (stride expert_weight_stride == src0->nb[2]) is a self-contained reorder/SoA layout. +// vy: src1 quantized with quantize_and_reorder_q8_1_soa (per-row SoA). Returns false if src0_type isn't handled. +bool ggml_sycl_mul_mat_vec_q_id_reorder( + enum ggml_type src0_type, + const void * vx_base, + const void * vy, + const int32_t * ids_dev, + float * dst_base, + int ncols, + int nrows, + int n_experts_used, + size_t expert_weight_stride, + size_t dst_row_stride, + size_t src1_row_stride, + dpct::queue_ptr stream); + #endif // GGML_SYCL_MMVQ_HPP diff --git a/ggml/src/ggml-sycl/norm.cpp b/ggml/src/ggml-sycl/norm.cpp index 09fce1280adc..c4472e4bd66f 100644 --- a/ggml/src/ggml-sycl/norm.cpp +++ b/ggml/src/ggml-sycl/norm.cpp @@ -2,8 +2,10 @@ #include "ggml-sycl/common.hpp" #include "ggml-sycl/presets.hpp" -static void norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, - const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) { +static void norm_f32(const float* x, float* dst, const int ncols, + const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample, + const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample, + const float eps, const sycl::nd_item<3>& item_ct1, sycl::float2* s_sum, int block_size) { const int nrows = item_ct1.get_group_range(2); const int nchannels = item_ct1.get_group_range(1); @@ -16,16 +18,16 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t const int tid = item_ct1.get_local_id(2); const int nwarps = nthreads / WARP_SIZE; - const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row}); - const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row}); + const auto src_offset = calculate_offset<3>({src_stride_sample, src_stride_channel, src_stride_row}, {sample, channel, row}); + const auto dst_offset = calculate_offset<3>({dst_stride_sample, dst_stride_channel, dst_stride_row}, {sample, channel, row}); - x += strided_offset; - dst += packed_offset; + x += src_offset; + dst += dst_offset; sycl::float2 mean_var = sycl::float2(0.f, 0.f); for (int col = tid; col < ncols; col += block_size) { - const float xi = x[col]; + const float xi = x[col * src_stride_col]; mean_var.x() += xi; mean_var.y() += xi * xi; } @@ -54,7 +56,7 @@ static void norm_f32(const float* x, float* dst, const int ncols, const int64_t const float inv_std = sycl::rsqrt(var + eps); for (int col = tid; col < ncols; col += block_size) { - dst[col] = (x[col] - mean) * inv_std; + dst[col * dst_stride_col] = (x[col * src_stride_col] - mean) * inv_std; } } @@ -145,8 +147,10 @@ static void group_norm_f32(const float* x, float* dst, const int group_size, con } } -static void rms_norm_f32(const float* x, float* dst, const int ncols, const int64_t stride_row, const int64_t stride_channel, - const int64_t stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) { +static void rms_norm_f32(const float* x, float* dst, const int ncols, + const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample, + const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample, + const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, int block_size) { const int nrows = item_ct1.get_group_range(2); const int nchannels = item_ct1.get_group_range(1); @@ -160,17 +164,17 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6 const int tid = item_ct1.get_local_id(2); const int nwarps = nthreads / WARP_SIZE; - const auto strided_offset = calculate_offset<3>({stride_sample, stride_channel, stride_row}, {sample, channel, row}); - const auto packed_offset = calculate_offset<3>({nchannels * nrows * ncols, nrows * ncols, ncols}, {sample, channel, row}); + const auto src_offset = calculate_offset<3>({src_stride_sample, src_stride_channel, src_stride_row}, {sample, channel, row}); + const auto dst_offset = calculate_offset<3>({dst_stride_sample, dst_stride_channel, dst_stride_row}, {sample, channel, row}); - x += strided_offset; - dst += packed_offset; + x += src_offset; + dst += dst_offset; float tmp = 0.0f; // partial sum for thread in warp for (int col = tid; col < ncols; col += block_size) { - const float xi = x[col]; + const float xi = x[col * src_stride_col]; tmp += xi * xi; } @@ -198,14 +202,15 @@ static void rms_norm_f32(const float* x, float* dst, const int ncols, const int6 const float scale = sycl::rsqrt(mean + eps); for (int col = tid; col < ncols; col += block_size) { - dst[col] = scale * x[col]; + dst[col * dst_stride_col] = scale * x[col * src_stride_col]; } } template static void l2_norm_f32(const float * x, float * dst, const int ncols, - const int64_t stride_row, const int64_t stride_channel, - const int64_t stride_sample, const float eps, + const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, + const int64_t src_stride_sample, const int64_t dst_stride_col, const int64_t dst_stride_row, + const int64_t dst_stride_channel, const int64_t dst_stride_sample, const float eps, const sycl::nd_item<3>& item_ct1, float* s_sum, const int block_size) { const int nrows = item_ct1.get_group_range(2); const int nchannels = item_ct1.get_group_range(1); @@ -215,13 +220,13 @@ static void l2_norm_f32(const float * x, float * dst, const int ncols, const int sample = item_ct1.get_group(0); const int tid = item_ct1.get_local_id(2); - x += sample*stride_sample + channel*stride_channel + row*stride_row; - dst += ((sample*nchannels + channel)*nrows + row)*ncols; + x += sample*src_stride_sample + channel*src_stride_channel + row*src_stride_row; + dst += sample*dst_stride_sample + channel*dst_stride_channel + row*dst_stride_row; float tmp = 0.0f; // partial sum for thread in warp for (int col = tid; col < ncols; col += block_size) { - const float xi = x[col]; + const float xi = x[col * src_stride_col]; tmp += xi * xi; } @@ -229,12 +234,13 @@ static void l2_norm_f32(const float * x, float * dst, const int ncols, const float scale = sycl::rsqrt(sycl::fmax(tmp, eps * eps)); for (int col = tid; col < ncols; col += block_size) { - dst[col] = scale * x[col]; + dst[col * dst_stride_col] = scale * x[col * src_stride_col]; } } static void norm_f32_sycl(const float * x, float * dst, const int ncols, const int nrows, const int nchannels, const int nsamples, - const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, + const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample, + const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample, const float eps, queue_ptr stream, int device) { const sycl::range<3> global_dims(nsamples, nchannels, nrows); @@ -245,7 +251,10 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i sycl::nd_range<3>(global_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE); + norm_f32(x, dst, ncols, + src_stride_col, src_stride_row, src_stride_channel, src_stride_sample, + dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample, + eps, item_ct1, nullptr, WARP_SIZE); }); }); } @@ -265,7 +274,10 @@ static void norm_f32_sycl(const float * x, float * dst, const int ncols, const i sycl::nd_range<3>(global_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); + norm_f32(x, dst, ncols, + src_stride_col, src_stride_row, src_stride_channel, src_stride_sample, + dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample, + eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); }); }); } @@ -319,7 +331,9 @@ static void group_norm_f32_sycl(const float* x, float* dst, } static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const int nrows, const int nchannels, const int nsamples, - const int64_t stride_row, const int64_t stride_channel, const int64_t stride_sample, const float eps, queue_ptr stream, int device) { + const int64_t src_stride_col, const int64_t src_stride_row, const int64_t src_stride_channel, const int64_t src_stride_sample, + const int64_t dst_stride_col, const int64_t dst_stride_row, const int64_t dst_stride_channel, const int64_t dst_stride_sample, + const float eps, queue_ptr stream, int device) { // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE); const sycl::range<3> global_dims(nsamples, nchannels, nrows); @@ -330,7 +344,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const sycl::nd_range<3>(global_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, nullptr, WARP_SIZE); + rms_norm_f32(x, dst, ncols, + src_stride_col, src_stride_row, src_stride_channel, src_stride_sample, + dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample, + eps, item_ct1, nullptr, WARP_SIZE); }); }); } @@ -350,7 +367,10 @@ static void rms_norm_f32_sycl(const float* x, float* dst, const int ncols, const sycl::nd_range<3>(global_dims * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - rms_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); + rms_norm_f32(x, dst, ncols, + src_stride_col, src_stride_row, src_stride_channel, src_stride_sample, + dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample, + eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); }); }); } @@ -363,9 +383,14 @@ static void l2_norm_f32_sycl(const float * x, const int nrows, const int nchannels, const int nsamples, - const int64_t stride_row, - const int64_t stride_channel, - const int64_t stride_sample, + const int64_t src_stride_col, + const int64_t src_stride_row, + const int64_t src_stride_channel, + const int64_t src_stride_sample, + const int64_t dst_stride_col, + const int64_t dst_stride_row, + const int64_t dst_stride_channel, + const int64_t dst_stride_sample, const float eps, queue_ptr stream, int device) { @@ -379,7 +404,10 @@ static void l2_norm_f32_sycl(const float * x, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(warp_size)]] { - l2_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, eps, item_ct1, + l2_norm_f32(x, dst, ncols, + src_stride_col, src_stride_row, src_stride_channel, src_stride_sample, + dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample, + eps, item_ct1, nullptr, warp_size); }); }); @@ -398,7 +426,9 @@ static void l2_norm_f32_sycl(const float * x, block_dims), [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(warp_size)]] { - l2_norm_f32(x, dst, ncols, stride_row, stride_channel, stride_sample, + l2_norm_f32(x, dst, ncols, + src_stride_col, src_stride_row, src_stride_channel, src_stride_sample, + dst_stride_col, dst_stride_row, dst_stride_channel, dst_stride_sample, eps, item_ct1, get_pointer(s_sum_acc_ct1), work_group_size); }); }); @@ -421,12 +451,20 @@ void ggml_sycl_op_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { memcpy(&eps, dst->op_params, sizeof(float)); GGML_ASSERT(eps >= 0.0f); const size_t ts0 = ggml_type_size(src0->type); - GGML_ASSERT(nb00 == ts0); - const int64_t s01 = nb01 / ts0; - const int64_t s02 = nb02 / ts0; - const int64_t s03 = nb03 / ts0; - - norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device); + const size_t tdst = ggml_type_size(dst->type); + GGML_ASSERT(nb00 % ts0 == 0 && nb01 % ts0 == 0 && nb02 % ts0 == 0 && nb03 % ts0 == 0); + GGML_ASSERT(nb0 % tdst == 0 && nb1 % tdst == 0 && nb2 % tdst == 0 && nb3 % tdst == 0); + const int64_t ss0 = nb00 / ts0; + const int64_t ss1 = nb01 / ts0; + const int64_t ss2 = nb02 / ts0; + const int64_t ss3 = nb03 / ts0; + const int64_t ds0 = nb0 / tdst; + const int64_t ds1 = nb1 / tdst; + const int64_t ds2 = nb2 / tdst; + const int64_t ds3 = nb3 / tdst; + + norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, + ss0, ss1, ss2, ss3, ds0, ds1, ds2, ds3, eps, main_stream, ctx.device); } void ggml_sycl_op_group_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { @@ -465,11 +503,19 @@ void ggml_sycl_op_rms_norm(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_TENSOR_UNARY_OP_LOCALS const size_t ts0 = ggml_type_size(src0->type); - GGML_ASSERT(nb00 == ts0); - const int64_t s01 = nb01 / ts0; - const int64_t s02 = nb02 / ts0; - const int64_t s03 = nb03 / ts0; - rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, s01, s02, s03, eps, main_stream, ctx.device); + const size_t tdst = ggml_type_size(dst->type); + GGML_ASSERT(nb00 % ts0 == 0 && nb01 % ts0 == 0 && nb02 % ts0 == 0 && nb03 % ts0 == 0); + GGML_ASSERT(nb0 % tdst == 0 && nb1 % tdst == 0 && nb2 % tdst == 0 && nb3 % tdst == 0); + const int64_t ss0 = nb00 / ts0; + const int64_t ss1 = nb01 / ts0; + const int64_t ss2 = nb02 / ts0; + const int64_t ss3 = nb03 / ts0; + const int64_t ds0 = nb0 / tdst; + const int64_t ds1 = nb1 / tdst; + const int64_t ds2 = nb2 / tdst; + const int64_t ds3 = nb3 / tdst; + rms_norm_f32_sycl(src0_dd, dst_dd, ne00, ne01, ne02, ne03, + ss0, ss1, ss2, ss3, ds0, ds1, ds2, ds3, eps, main_stream, ctx.device); } void ggml_sycl_op_rms_norm_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { @@ -644,13 +690,21 @@ void ggml_sycl_op_l2_norm(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { GGML_ASSERT(eps >= 0.0f); const size_t ts0 = ggml_type_size(src0->type); - GGML_ASSERT(nb00 == ts0); - const int64_t s01 = nb01 / ts0; - const int64_t s02 = nb02 / ts0; - const int64_t s03 = nb03 / ts0; + const size_t tdst = ggml_type_size(dst->type); + GGML_ASSERT(nb00 % ts0 == 0 && nb01 % ts0 == 0 && nb02 % ts0 == 0 && nb03 % ts0 == 0); + GGML_ASSERT(nb0 % tdst == 0 && nb1 % tdst == 0 && nb2 % tdst == 0 && nb3 % tdst == 0); + const int64_t ss0 = nb00 / ts0; + const int64_t ss1 = nb01 / ts0; + const int64_t ss2 = nb02 / ts0; + const int64_t ss3 = nb03 / ts0; + const int64_t ds0 = nb0 / tdst; + const int64_t ds1 = nb1 / tdst; + const int64_t ds2 = nb2 / tdst; + const int64_t ds3 = nb3 / tdst; /*support both WARP_SIZE or WARP_32_SIZE in code choose by hardware for better performance */ - l2_norm_f32_sycl(src0_d, dst_d, ne00, ne01, ne02, ne03, s01, s02, s03, eps, stream, ctx.device); + l2_norm_f32_sycl(src0_d, dst_d, ne00, ne01, ne02, ne03, + ss0, ss1, ss2, ss3, ds0, ds1, ds2, ds3, eps, stream, ctx.device); } diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp index f52b11f0d6ed..8d10dad8c975 100644 --- a/ggml/src/ggml-sycl/outprod.cpp +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -1,11 +1,12 @@ #include "outprod.hpp" +#include "convert.hpp" void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); const ggml_tensor *src0 = dst->src[0]; const ggml_tensor *src1 = dst->src[1]; - GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q1_0); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(ggml_is_contiguous(src0)); @@ -20,11 +21,31 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { GGML_ASSERT(ne01 == ne11); // Inner dimensions must match GGML_ASSERT(ne0 == ne00); // Output rows match src0 rows GGML_ASSERT(ne1 == ne10); // Output cols match src1 cols + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(ne2 % ne02 == 0); + GGML_ASSERT(ne3 % ne03 == 0); // Get data pointers - const float* src0_d = (const float*)src0->data; - const float* src1_d = (const float*)src1->data; - float* dst_d = (float*)dst->data; + const float * src0_d = (const float *) src0->data; + const float * src1_d = (const float *) src1->data; + float * dst_d = (float *) dst->data; + + ggml_sycl_pool_alloc src0_as_f32(ctx.pool()); + int64_t src0_nb02 = nb02; + int64_t src0_nb03 = nb03; + if (src0->type == GGML_TYPE_Q1_0) { + scope_op_debug_print scope_dbg_print(__func__, "/to_fp32_sycl", dst, /*num_src=*/2, + " : converting src0 Q1_0 to fp32"); + src0_d = src0_as_f32.alloc(ne00 * ne01 * ne02 * ne03); + const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type, dst); + GGML_ASSERT(to_fp32_sycl != nullptr); + to_fp32_sycl(src0->data, const_cast(src0_d), ne00 * ne01 * ne02 * ne03, stream); + + // Dequantized src0 buffer is contiguous fp32 [ne00, ne01, ne02, ne03]. + src0_nb02 = ne00 * ne01 * (int64_t) sizeof(float); + src0_nb03 = ne00 * ne01 * ne02 * (int64_t) sizeof(float); + } // GEMM parameters const float alpha = 1.0f; @@ -35,12 +56,27 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { const oneapi::mkl::transpose src1_op = src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans; const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float); + const int64_t r2 = ne2 / ne02; + const int64_t r3 = ne3 / ne03; + try { - // Perform matrix multiplication using oneMKL GEMM - oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, - ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0); - } - catch (sycl::exception const& exc) { + // OUT_PROD applies independently to each (i2, i3) destination plane. + for (int64_t i3 = 0; i3 < ne3; ++i3) { + for (int64_t i2 = 0; i2 < ne2; ++i2) { + const int64_t i03 = i3 / r3; + const int64_t i02 = i2 / r2; + + const float * src0_plane = (const float *) ((const char *) src0_d + i02 * src0_nb02 + i03 * src0_nb03); + const float * src1_plane = (const float *) ((const char *) src1_d + i2 * nb12 + i3 * nb13); + float * dst_plane = (float *) ((char *) dst_d + i2 * nb2 + i3 * nb3); + + // Perform matrix multiplication using oneMKL GEMM + oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, + ne0, ne1, ne01, alpha, src0_plane, ne00, + src1_plane, ldb, beta, dst_plane, ne0); + } + } + } catch (sycl::exception const& exc) { std::cerr << exc.what() << std::endl; GGML_ASSERT(false); } diff --git a/ggml/src/ggml-sycl/pool.cpp b/ggml/src/ggml-sycl/pool.cpp new file mode 100644 index 000000000000..de704309f246 --- /dev/null +++ b/ggml/src/ggml-sycl/pool.cpp @@ -0,0 +1,185 @@ +// +// MIT license +// Copyright (C) 2026 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#include "pool.hpp" +#include + +template +static void pool2d_nchw_kernel( + const int ih, const int iw, const int oh, const int ow, + const int kh, const int kw, const int sh, const int sw, + const int ph, const int pw, const int parallel_elements, + const Ti* src, To* dst, const enum ggml_op_pool op, + const sycl::nd_item<3> &item_ct1) { + int idx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (idx >= parallel_elements) { + return; + } + + const int I_HW = ih * iw; + const int O_HW = oh * ow; + const int nc = idx / O_HW; + const int cur_oh = idx % O_HW / ow; + const int cur_ow = idx % O_HW % ow; + const Ti* i_ptr = src + nc * I_HW; + To* o_ptr = dst + nc * O_HW; + const int start_h = cur_oh * sh - ph; + const int bh = sycl::max(0, start_h); + const int eh = sycl::min(ih, start_h + kh); + const int start_w = cur_ow * sw - pw; + const int bw = sycl::max(0, start_w); + const int ew = sycl::min(iw, start_w + kw); + + To res = 0; + + switch (op) { + case GGML_OP_POOL_AVG: res = 0; break; + case GGML_OP_POOL_MAX: res = -FLT_MAX; break; + default: + res = (To) sycl::nan(uint32_t(0)); + break; + } + + for (int i = bh; i < eh; i += 1) { + for (int j = bw; j < ew; j += 1) { + Ti cur = i_ptr[i * iw + j]; + switch (op) { + case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break; + case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break; + default: + res = (To) sycl::nan(uint32_t(0)); + break; + } + } + } + o_ptr[cur_oh * ow + cur_ow] = res; +} + +template +static void pool1d_ncw_kernel( + const int iw, const int ow, + const int k, const int s, + const int p, const int parallel_elements, + const Ti * src, To * dst, const enum ggml_op_pool op, + const sycl::nd_item<3> & item_ct1) { + int idx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (idx >= parallel_elements) { + return; + } + + const int nc = idx / ow; + const int cur_ow = idx % ow; + const Ti * i_ptr = src + nc * iw; + To * o_ptr = dst + nc * ow; + const int start = cur_ow * s - p; + const int b = sycl::max(0, start); + const int e = sycl::min(iw, start + k); + + To res = 0; + switch (op) { + case GGML_OP_POOL_AVG: res = 0; break; + case GGML_OP_POOL_MAX: res = -FLT_MAX; break; + default: + res = (To) sycl::nan(uint32_t(0)); + break; + } + + for (int j = b; j < e; j += 1) { + Ti cur = i_ptr[j]; + switch (op) { + case GGML_OP_POOL_AVG: res += cur; break; + case GGML_OP_POOL_MAX: res = sycl::max(res, (To) cur); break; + default: + res = (To) sycl::nan(uint32_t(0)); + break; + } + } + + const int count = e - b; + if (op == GGML_OP_POOL_AVG) { + res = (count > 0) ? (res / count) : (To) 0; + } + o_ptr[cur_ow] = res; +} + +void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = static_cast(opts[0]); + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + + const int64_t IH = dst->src[0]->ne[1]; + const int64_t IW = dst->src[0]->ne[0]; + + const int64_t N = dst->ne[3]; + const int64_t OC = dst->ne[2]; + const int64_t OH = dst->ne[1]; + const int64_t OW = dst->ne[0]; + + const int parallel_elements = N * OC * OH * OW; + const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE; + sycl::range<3> block_nums(1, 1, num_blocks); + main_stream->parallel_for( + sycl::nd_range<3>(block_nums * + sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + pool2d_nchw_kernel(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, + parallel_elements, src0_dd, dst_dd, op, + item_ct1); + }); +} + +void ggml_sycl_op_pool1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + const float * src0_dd = static_cast(dst->src[0]->data); + float * dst_dd = static_cast(dst->data); + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = static_cast(opts[0]); + const int k0 = opts[1]; + const int s0 = opts[2]; + const int p0 = opts[3]; + + const int64_t IW = dst->src[0]->ne[0]; + const int64_t OW = dst->ne[0]; + const int64_t NC = dst->ne[3] * dst->ne[2] * dst->ne[1]; + + const int parallel_elements = NC * OW; + const int num_blocks = (parallel_elements + SYCL_POOL1D_BLOCK_SIZE - 1) / SYCL_POOL1D_BLOCK_SIZE; + sycl::range<3> block_nums(1, 1, num_blocks); + main_stream->parallel_for( + sycl::nd_range<3>(block_nums * + sycl::range<3>(1, 1, SYCL_POOL1D_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_POOL1D_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + pool1d_ncw_kernel(IW, OW, k0, s0, p0, + parallel_elements, src0_dd, dst_dd, op, + item_ct1); + }); +} diff --git a/ggml/src/ggml-sycl/pool.hpp b/ggml/src/ggml-sycl/pool.hpp new file mode 100644 index 000000000000..a1790449cea4 --- /dev/null +++ b/ggml/src/ggml-sycl/pool.hpp @@ -0,0 +1,22 @@ +// +// MIT license +// Copyright (C) 2026 Intel Corporation +// SPDX-License-Identifier: MIT +// + +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// + +#ifndef GGML_SYCL_POOL_HPP +#define GGML_SYCL_POOL_HPP + +#include "common.hpp" +#include "presets.hpp" + +void ggml_sycl_op_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_op_pool1d(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_POOL_HPP diff --git a/ggml/src/ggml-sycl/presets.hpp b/ggml/src/ggml-sycl/presets.hpp index dc4dad1d37a8..54566316ad1b 100644 --- a/ggml/src/ggml-sycl/presets.hpp +++ b/ggml/src/ggml-sycl/presets.hpp @@ -46,6 +46,7 @@ #define SYCL_PAD_BLOCK_SIZE 256 #define SYCL_ACC_BLOCK_SIZE 256 #define SYCL_IM2COL_BLOCK_SIZE 256 +#define SYCL_POOL1D_BLOCK_SIZE 256 #define SYCL_POOL2D_BLOCK_SIZE 256 #define SYCL_ARGMAX_BLOCK_SIZE 256 #define SYCL_CONV_TRANPOSE_1D_BLOCK_SIZE 256 diff --git a/ggml/src/ggml-sycl/set_rows.cpp b/ggml/src/ggml-sycl/set_rows.cpp index 8fb41943525c..5fb97790712d 100644 --- a/ggml/src/ggml-sycl/set_rows.cpp +++ b/ggml/src/ggml-sycl/set_rows.cpp @@ -135,7 +135,7 @@ static void set_rows_sycl( stream->parallel_for( sycl::nd_range<1>(grid_size * block_size, block_size), - [=](sycl::nd_item<1> item_ct1) { + [=](sycl::nd_item<1> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] { k_set_rows( src0_d, src1_d, dst_d, ne00, ne01, ne02, @@ -202,6 +202,9 @@ static void set_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * s case GGML_TYPE_Q8_0: set_rows_sycl_q(src0_d, src1_d, (block_q8_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream); break; + case GGML_TYPE_Q1_0: + set_rows_sycl_q(src0_d, src1_d, (block_q1_0 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream); + break; case GGML_TYPE_Q5_1: set_rows_sycl_q(src0_d, src1_d, (block_q5_1 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream); break; @@ -217,7 +220,12 @@ static void set_rows_sycl(ggml_backend_sycl_context & ctx, const ggml_tensor * s case GGML_TYPE_IQ4_NL: set_rows_sycl_q(src0_d, src1_d, (block_iq4_nl *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream); break; - + case GGML_TYPE_MXFP4: + set_rows_sycl_q(src0_d, src1_d, (block_mxfp4 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream); + break; + case GGML_TYPE_NVFP4: + set_rows_sycl_q(src0_d, src1_d, (block_nvfp4 *)dst->data, ne00, ne01, ne02, ne03, ne10, ne11, ne12, ne13, nb00, nb01, nb02, nb03, nb10, nb11, nb12, nb13, nb1, nb2, nb3, stream); + break; default: GGML_ABORT("Unsupported tensor type!"); break; diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp index fdf9b843e015..67ea282b4b30 100644 --- a/ggml/src/ggml-sycl/softmax.cpp +++ b/ggml/src/ggml-sycl/softmax.cpp @@ -56,7 +56,7 @@ static void soft_max_f32(const float * x, : block_size_template; const int nthreads = block_size; const int nwarps = nthreads / WARP_SIZE; - size_t nreduce = nwarps / WARP_SIZE; + const size_t nreduce = nwarps / WARP_SIZE; const int tid = item_ct1.get_local_id(2); @@ -105,17 +105,15 @@ static void soft_max_f32(const float * x, max_val = warp_reduce_max(max_val); if (block_size > WARP_SIZE) { - if (warp_id == 0) { - buf_iw[lane_id] = -INFINITY; - } - item_ct1.barrier(); - if (lane_id == 0) { buf_iw[warp_id] = max_val; } item_ct1.barrier(); - max_val = buf_iw[lane_id]; + max_val = -INFINITY; + for (int i = lane_id; i < nwarps; i += WARP_SIZE) { + max_val = sycl::max(max_val, buf_iw[i]); + } max_val = warp_reduce_max(max_val); } float tmp = 0.0f; // partial sum @@ -128,7 +126,7 @@ static void soft_max_f32(const float * x, break; } - const float val = sycl::native::exp(vals[col] - max_val); + const float val = sycl::native::exp(sycl::max(vals[col] - max_val, -80.0f)); tmp += val; vals[col] = val; } @@ -156,7 +154,7 @@ static void soft_max_f32(const float * x, tmp = warp_reduce_sum(tmp); } if (sinks) { - tmp += sycl::native::exp(sinks[i02] - max_val); + tmp += sycl::native::exp(sycl::max(sinks[i02] - max_val, -80.0f)); } const float inv_sum = 1.0f / tmp; @@ -290,7 +288,8 @@ static void soft_max_f32_sycl(const float *x, const T *mask, cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { + [=](sycl::nd_item<3> item_ct1) + [[sycl::reqd_sub_group_size(WARP_SIZE)]] { soft_max_f32( x, mask, sinks, dst, params, dpct_local_acc_ct1 diff --git a/ggml/src/ggml-sycl/vecdotq.hpp b/ggml/src/ggml-sycl/vecdotq.hpp index 4b58b09ab2c5..765fb7f1590b 100644 --- a/ggml/src/ggml-sycl/vecdotq.hpp +++ b/ggml/src/ggml-sycl/vecdotq.hpp @@ -309,6 +309,41 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh, vl, vh, u[0], u[1], scales[0], scales[4], d, d8[0], d8[1]); } +#define VDR_Q1_0_Q8_1_MMVQ 1 +#define VDR_Q1_0_Q8_1_MMQ 4 + +static __dpct_inline__ float +vec_dot_q1_0_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_q1_0 * bq1_0 = (const block_q1_0 *) vbq; + + const block_q8_1 * bq8_1_chunk = bq8_1 + iqs; + const float d1 = bq1_0->d; + const int v = get_int_from_uint8_aligned(bq1_0->qs, iqs); + + int vi_bytes[8]; +#pragma unroll + for (int j = 0; j < 8; ++j) { + const int shift = j * 4; + const int bits4 = (v >> shift) & 0x0F; + const int b0 = (bits4 & 0x01) ? 1 : -1; + const int b1 = (bits4 & 0x02) ? 1 : -1; + const int b2 = (bits4 & 0x04) ? 1 : -1; + const int b3 = (bits4 & 0x08) ? 1 : -1; + vi_bytes[j] = (b0 & 0xFF) | ((b1 & 0xFF) << 8) | ((b2 & 0xFF) << 16) | ((b3 & 0xFF) << 24); + } + + int sumi = 0; +#pragma unroll + for (int j = 0; j < 8; ++j) { + const int u = get_int_from_int8_aligned(bq8_1_chunk->qs, j); + sumi = ggml_sycl_dp4a(vi_bytes[j], u, sumi); + } + + return d1 * bq8_1_chunk->ds[0] * sumi; +} + // VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called // MMVQ = mul_mat_vec_q, MMQ = mul_mat_q diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index 2d9e85794ad4..5aeb6e97b159 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -108,6 +108,9 @@ if (Vulkan_FOUND) if (GGML_VULKAN_CHECK_RESULTS) add_compile_definitions(GGML_VULKAN_CHECK_RESULTS) + # the result-checking path computes a CPU reference graph via + # ggml_graph_compute_with_ctx(), which is defined in ggml-cpu + target_link_libraries(ggml-vulkan PRIVATE ggml-cpu) endif() if (GGML_VULKAN_DEBUG) @@ -129,6 +132,8 @@ if (Vulkan_FOUND) if (GGML_VULKAN_RUN_TESTS) add_compile_definitions(GGML_VULKAN_RUN_TESTS) + # the test path also calls ggml_graph_compute_with_ctx() (ggml-cpu) + target_link_libraries(ggml-vulkan PRIVATE ggml-cpu) endif() # Set up toolchain for host compilation whether cross-compiling or not diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 387826b6d932..ea9191873282 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -308,6 +308,7 @@ enum vk_device_architecture { AMD_RDNA1, AMD_RDNA2, AMD_RDNA3, + INTEL_XE1, INTEL_XE2, NVIDIA_PRE_TURING, NVIDIA_TURING, @@ -365,21 +366,26 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& const std::vector ext_props = device.enumerateDeviceExtensionProperties(); bool subgroup_size_control = false; + bool integer_dot_product = false; for (const auto& properties : ext_props) { if (strcmp("VK_EXT_subgroup_size_control", properties.extensionName) == 0) { subgroup_size_control = true; + } else if (strcmp("VK_KHR_shader_integer_dot_product", properties.extensionName) == 0) { + integer_dot_product = true; } } - if (!subgroup_size_control) { + if (!subgroup_size_control || !integer_dot_product) { return vk_device_architecture::OTHER; } vk::PhysicalDeviceProperties2 props2; vk::PhysicalDeviceSubgroupSizeControlPropertiesEXT subgroup_size_control_props; + vk::PhysicalDeviceShaderIntegerDotProductPropertiesKHR integer_dot_props; props2.pNext = &subgroup_size_control_props; + subgroup_size_control_props.pNext = &integer_dot_props; device.getProperties2(&props2); if (subgroup_size_control_props.minSubgroupSize == 16) { @@ -388,6 +394,9 @@ static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& // https://www.intel.com/content/www/us/en/content-details/824434/2024-intel-tech-tour-xe2-and-lunar-lake-s-gpu.html // https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html return vk_device_architecture::INTEL_XE2; + } else if (subgroup_size_control_props.minSubgroupSize == 8 && + integer_dot_product && integer_dot_props.integerDotProduct4x8BitPackedSignedAccelerated) { + return vk_device_architecture::INTEL_XE1; } } else if (props.vendorID == VK_VENDOR_ID_NVIDIA) { const std::vector ext_props = device.enumerateDeviceExtensionProperties(); @@ -493,6 +502,20 @@ struct vk_conv2d_pipeline_state { } }; +struct vk_conv3d_pipeline_state { + vk_conv3d_pipeline_state(uint32_t s0, uint32_t s1, uint32_t s2, uint32_t p0, uint32_t p1, uint32_t p2, + uint32_t d0, uint32_t d1, uint32_t d2, uint32_t KW, uint32_t KH, uint32_t KD, uint32_t aligned) + : s0(s0), s1(s1), s2(s2), p0(p0), p1(p1), p2(p2), d0(d0), d1(d1), d2(d2), KW(KW), KH(KH), KD(KD), aligned(aligned) {} + + uint32_t s0, s1, s2, p0, p1, p2, d0, d1, d2, KW, KH, KD; + uint32_t aligned; + + bool operator<(const vk_conv3d_pipeline_state &b) const { + return std::tie(s0, s1, s2, p0, p1, p2, d0, d1, d2, KW, KH, KD, aligned) < + std::tie(b.s0, b.s1, b.s2, b.p0, b.p1, b.p2, b.d0, b.d1, b.d2, b.KW, b.KH, b.KD, b.aligned); + } +}; + struct vk_solve_tri_pipeline_state { vk_solve_tri_pipeline_state(uint32_t N, uint32_t K) : N(N), K(K) {} @@ -685,6 +708,7 @@ struct vk_device_struct { bool add_rms_fusion; uint32_t partials_binding_alignment; + uint32_t max_nodes_per_submit; bool shader_64b_indexing; @@ -777,6 +801,7 @@ struct vk_device_struct { vk_pipeline pipeline_mul_mat_vec_nc_f16_f32; vk_pipeline pipeline_get_rows[GGML_TYPE_COUNT]; vk_pipeline pipeline_get_rows_f32[GGML_TYPE_COUNT]; + vk_pipeline pipeline_get_rows_back_f32; vk_pipeline pipeline_acc_f32; vk_pipeline pipeline_set_f32; @@ -798,17 +823,13 @@ struct vk_device_struct { vk_pipeline pipeline_add_id_f32; - vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32; + vk_pipeline pipeline_concat_i8, pipeline_concat_i16, pipeline_concat_i32, pipeline_concat_i64; vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bicubic_f32, pipeline_upscale_bilinear_antialias_f32; vk_pipeline pipeline_scale_f32; - vk_pipeline pipeline_sqr_f32; - vk_pipeline pipeline_sqrt_f32; - vk_pipeline pipeline_sin_f32; - vk_pipeline pipeline_cos_f32; vk_pipeline pipeline_log[2]; vk_pipeline pipeline_tri[2]; vk_pipeline pipeline_diag[2]; - vk_pipeline pipeline_clamp_f32; + vk_pipeline pipeline_clamp[2]; vk_pipeline pipeline_pad_f32; vk_pipeline pipeline_roll_f32; vk_pipeline pipeline_repeat_i32, pipeline_repeat_back_f32; @@ -833,12 +854,17 @@ struct vk_device_struct { // [src/dst 0=fp32,1=fp16] vk_pipeline pipeline_exp[2]; + vk_pipeline pipeline_expm1[2]; vk_pipeline pipeline_elu[2]; vk_pipeline pipeline_gelu[2]; vk_pipeline pipeline_gelu_erf[2]; vk_pipeline pipeline_gelu_quick[2]; vk_pipeline pipeline_silu[2]; vk_pipeline pipeline_relu[2]; + vk_pipeline pipeline_sqr[2]; + vk_pipeline pipeline_sqrt[2]; + vk_pipeline pipeline_sin[2]; + vk_pipeline pipeline_cos[2]; vk_pipeline pipeline_xielu[2]; vk_pipeline pipeline_neg[2]; vk_pipeline pipeline_tanh[2]; @@ -870,7 +896,7 @@ struct vk_device_struct { vk_pipeline pipeline_geglu_erf[2]; vk_pipeline pipeline_geglu_quick[2]; - vk_pipeline pipeline_leaky_relu_f32; + vk_pipeline pipeline_leaky_relu[2]; vk_pipeline pipeline_silu_back_f32; vk_pipeline pipeline_diag_mask_inf_f32; vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16; @@ -901,14 +927,17 @@ struct vk_device_struct { vk_pipeline pipeline_im2col_3d_f32, pipeline_im2col_3d_f32_f16; vk_pipeline pipeline_timestep_embedding_f32; vk_pipeline pipeline_conv_transpose_1d_f32; + vk_pipeline pipeline_col2im_1d_f32; + vk_pipeline pipeline_col2im_1d_f16; + vk_pipeline pipeline_col2im_1d_bf16; vk_pipeline pipeline_snake_f32; vk_pipeline pipeline_snake_f16; vk_pipeline pipeline_snake_bf16; vk_pipeline pipeline_pool2d_f32; vk_pipeline pipeline_rwkv_wkv6_f32; vk_pipeline pipeline_rwkv_wkv7_f32; - // [size_idx][kda] where size_idx: 0=d32, 1=d64, 2=d128 - vk_pipeline pipeline_gated_delta_net[3][2]; + // [size_idx][kda] where size_idx: 0=d16, 1=d32, 2=d64, 3=d128 + vk_pipeline pipeline_gated_delta_net[4][2]; vk_pipeline pipeline_ssm_scan_f32_d128; vk_pipeline pipeline_ssm_scan_f32_d256; vk_pipeline pipeline_ssm_conv_f32; @@ -920,6 +949,8 @@ struct vk_device_struct { std::map pipeline_conv2d_f16_f32[CONV_SHAPE_COUNT]; std::map pipeline_conv_transpose_2d_f32[CONV_SHAPE_COUNT]; std::map pipeline_conv_transpose_2d_f16_f32[CONV_SHAPE_COUNT]; + std::map pipeline_conv3d_f32[CONV_SHAPE_COUNT]; + std::map pipeline_conv3d_f16_f32[CONV_SHAPE_COUNT]; vk_pipeline pipeline_conv2d_dw_whcn_f32, pipeline_conv2d_dw_whcn_f16_f32; vk_pipeline pipeline_conv2d_dw_cwhn_f32, pipeline_conv2d_dw_cwhn_f16_f32; @@ -1202,30 +1233,35 @@ struct vk_op_glu_push_constants { uint32_t mode; // 0: default, 1: swapped, 2: split float alpha; // for swiglu_oai float limit; + uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; - uint32_t ne01; - uint32_t ne02; + uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; - uint32_t ne11; - uint32_t ne12; + uint32_t nb20; + uint32_t nb21; + uint32_t nb22; + uint32_t nb23; + uint32_t ne21; + uint32_t ne22; + uint32_t misalign_offsets; + uint32_t ne2_012mp; uint32_t ne2_012L; + uint32_t ne2_01mp; uint32_t ne2_01L; + uint32_t ne2_0mp; uint32_t ne2_0L; }; +static_assert(sizeof(vk_op_glu_push_constants) <= 128, "sizeof(vk_op_glu_push_constants) must be <= 128"); struct vk_op_unary_push_constants { uint32_t ne; uint32_t ne00; uint32_t ne01; uint32_t ne02; uint32_t ne03; uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03; uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13; uint32_t nb10; uint32_t nb11; uint32_t nb12; uint32_t nb13; uint32_t misalign_offsets; - float param1; float param2; - uint32_t ne0_012mp; uint32_t ne0_012L; - uint32_t ne0_01mp; uint32_t ne0_01L; - uint32_t ne0_0mp; uint32_t ne0_0L; - uint32_t ne1_012mp; uint32_t ne1_012L; - uint32_t ne1_01mp; uint32_t ne1_01L; - uint32_t ne1_0mp; uint32_t ne1_0L; + float param1; float param2; float param3; float param4; + uint32_t ne0_012mp; uint32_t ne0_01mp; uint32_t ne0_0mp; uint32_t ne0_Ls; + uint32_t ne1_012mp; uint32_t ne1_01mp; uint32_t ne1_0mp; uint32_t ne1_Ls; }; static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128"); @@ -1330,6 +1366,10 @@ static void init_fastdiv_values(uint32_t d, uint32_t &mp, uint32_t &L) mp = (uint32_t)((uint64_t{1} << 32) * ((uint64_t{1} << L) - d) / d + 1); } +static uint32_t pack_fastdiv_L(uint32_t L0, uint32_t L1, uint32_t L2) { + return L0 | (L1 << 8) | (L2 << 16); +} + template void init_pushconst_fastdiv(T &p) { GGML_UNUSED(p); static_assert(!std::is_const::value, "unexpected type"); @@ -1337,12 +1377,29 @@ template void init_pushconst_fastdiv(T &p) { template <> void init_pushconst_fastdiv(vk_op_unary_push_constants &p) { // Compute magic values to divide by these six numbers. - init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, p.ne0_012L); - init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, p.ne0_01L); - init_fastdiv_values(p.ne00, p.ne0_0mp, p.ne0_0L); - init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, p.ne1_012L); - init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, p.ne1_01L); - init_fastdiv_values(p.ne10, p.ne1_0mp, p.ne1_0L); + uint32_t ne0_012L; + uint32_t ne0_01L; + uint32_t ne0_0L; + uint32_t ne1_012L; + uint32_t ne1_01L; + uint32_t ne1_0L; + + init_fastdiv_values(p.ne02*p.ne01*p.ne00, p.ne0_012mp, ne0_012L); + init_fastdiv_values(p.ne01*p.ne00, p.ne0_01mp, ne0_01L); + init_fastdiv_values(p.ne00, p.ne0_0mp, ne0_0L); + init_fastdiv_values(p.ne12*p.ne11*p.ne10, p.ne1_012mp, ne1_012L); + init_fastdiv_values(p.ne11*p.ne10, p.ne1_01mp, ne1_01L); + init_fastdiv_values(p.ne10, p.ne1_0mp, ne1_0L); + + p.ne0_Ls = pack_fastdiv_L(ne0_012L, ne0_01L, ne0_0L); + p.ne1_Ls = pack_fastdiv_L(ne1_012L, ne1_01L, ne1_0L); +} + +template <> void init_pushconst_fastdiv(vk_op_glu_push_constants &p) { + // GLU linearizes over dst, then uses dst coordinates for src0/src1. + init_fastdiv_values(p.ne22*p.ne21*p.ne20, p.ne2_012mp, p.ne2_012L); + init_fastdiv_values(p.ne21*p.ne20, p.ne2_01mp, p.ne2_01L); + init_fastdiv_values(p.ne20, p.ne2_0mp, p.ne2_0L); } struct vk_op_binary_push_constants { @@ -1525,6 +1582,16 @@ struct vk_op_timestep_embedding_push_constants { uint32_t max_period; }; +struct vk_op_col2im_1d_push_constants { + uint32_t T_out; + uint32_t OC; + uint32_t K_OC; + uint32_t T_in; + uint32_t K; + int32_t stride; + int32_t p0; +}; + struct vk_op_conv_transpose_1d_push_constants { uint32_t Cout; uint32_t Cin; @@ -1629,6 +1696,41 @@ template <> void init_pushconst_fastdiv(vk_op_conv2d_push_constants &p) { init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL); } +struct vk_op_conv3d_push_constants { + uint32_t OC; + uint32_t IC; + uint32_t N; + + uint32_t IW; + uint32_t IH; + uint32_t ID; + uint32_t OW; + uint32_t OH; + uint32_t OD; + + uint32_t nb01; + uint32_t nb02; + uint32_t nb03; + + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; + + uint32_t nb1; + uint32_t nb2; + uint32_t nb3; + + uint32_t OWmp; uint32_t OWL; + uint32_t OWOHmp; uint32_t OWOHL; + uint32_t OWOHODmp; uint32_t OWOHODL; +}; + +template <> void init_pushconst_fastdiv(vk_op_conv3d_push_constants &p) { + init_fastdiv_values(p.OW, p.OWmp, p.OWL); + init_fastdiv_values(p.OW*p.OH, p.OWOHmp, p.OWOHL); + init_fastdiv_values(p.OW*p.OH*p.OD, p.OWOHODmp, p.OWOHODL); +} + struct vk_op_conv2d_dw_push_constants { uint32_t ne; uint32_t batches; @@ -2968,13 +3070,13 @@ static vk_buffer ggml_vk_create_buffer(vk_device& device, size_t size, const std if (memory_type_indices.empty()) { continue; } - buf->memory_property_flags = req_flags; bool done = false; for (auto mtype_it = memory_type_indices.begin(); mtype_it != memory_type_indices.end(); mtype_it++) { try { buf->device_memory = device->device.allocateMemory({ mem_req.size, *mtype_it, &mem_flags_info }); + buf->memory_property_flags = mem_props.memoryTypes[*mtype_it].propertyFlags; done = true; break; } catch (const vk::SystemError& e) { @@ -3040,8 +3142,10 @@ static vk_buffer ggml_vk_create_buffer_device(vk_device& device, size_t size) { buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, vk::MemoryPropertyFlagBits::eDeviceLocal}); } else if (device->uma) { - // Fall back to host memory type - buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal, + // On UMA, prefer host-visible memory so direct tensor borrowing works. + // If unavailable, fall back to device-local memory. + buf = ggml_vk_create_buffer(device, size, {vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent, + vk::MemoryPropertyFlagBits::eDeviceLocal, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent}); } else if (device->disable_host_visible_vidmem) { if (device->allow_sysmem_fallback) { @@ -3742,7 +3846,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { l_warptile = { 256, 128, 128, 16, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; l_warptile_mmq = l_warptile_mmq_int = { 256, 128, 128, 32, subgroup_size_8, 64, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; l_warptile_mmq_int_k = { 256, 128, 128, 32, subgroup_size_16, 64, 1, 4, 2, 1, subgroup_size_16 }; - } else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support && device->architecture == INTEL_XE2) { + } else if (device->vendor_id == VK_VENDOR_ID_INTEL && device->coopmat_support) { // Xe2/Xe3 with coopmat enabled - warptile performance tuning l_warptile = { 512, 128, 128, 16, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; l_warptile_mmq = { 512, 128, 128, 32, subgroup_size_8, 32, 2, tm_m, tn_m, tk_m, subgroup_size_8 }; @@ -4032,19 +4136,35 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { } #endif + auto const &ggml_vk_mul_mm_spec = [](std::vector spec, bool aligned) { + spec.push_back(aligned ? 1u : 0u); + return spec; + }; + const int mul_mat_id_param_count = 5; #if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) if (device->coopmat2) { + auto const &ggml_vk_mul_mm_cm2_spec = [](std::vector spec, bool aligned, bool mul_mat_id) { + if (mul_mat_id && spec.size() > 5) { + spec.insert(spec.begin() + 5, aligned ? 1u : 0u); + } else { + spec.push_back(aligned ? 1u : 0u); + } + if (mul_mat_id && spec.size() == 6) { + spec.push_back(32); + } + return spec; + }; // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, true); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, true); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, true); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, true); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, true); \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm2_len, NAMELC ## _aligned ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_cm2_spec(l_ ## WARPTILE, false, PARAMCOUNT == mul_mat_id_param_count), 1, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_cm2_spec(m_ ## WARPTILE, false, PARAMCOUNT == mul_mat_id_param_count), 1, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_cm2_spec(s_ ## WARPTILE, false, PARAMCOUNT == mul_mat_id_param_count), 1, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_cm2_spec(l_ ## WARPTILE, true, PARAMCOUNT == mul_mat_id_param_count), l_align, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_cm2_spec(m_ ## WARPTILE, true, PARAMCOUNT == mul_mat_id_param_count), m_align, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## F16ACC ## _cm2_len, NAMELC ## F16ACC ## _cm2_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_cm2_spec(s_ ## WARPTILE, true, PARAMCOUNT == mul_mat_id_param_count), s_align, true); \ // Create 2 variants, {f16,f32} accumulator #define CREATE_MM2(PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT) \ @@ -4119,17 +4239,17 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, false), 1, false, true); \ if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, false), 1, false, true); \ if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, false), 1, false, true); \ if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, true), l_align, false, true); \ if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, true), m_align, false, true); \ if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _cm1_len, NAMELC ## _aligned ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, true); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## F16ACC ## _cm1_len, NAMELC ## F16ACC ## _cm1_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, true), s_align, false, true); \ // Create 2 variants, {f16,f32} accumulator #define CREATE_MM2(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ @@ -4242,32 +4362,32 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { // Selects dot2 SPIR-V variant at runtime when device->dot2_f16 is true #define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, true), l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, true), m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _len : NAMELC ## _aligned ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2_aligned ## F16ACC ## _data : NAMELC ## _aligned ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _len : NAMELC ## F16ACC ## _len), (device->dot2_f16 ? NAMELC ## _dot2 ## F16ACC ## _data : NAMELC ## F16ACC ## _data), "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, true), s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ // bf16 scalar path promotes to f32, no dot2 variant #define CREATE_MM_NODOT2(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, true), l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, true), m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _len, NAMELC ## _aligned ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## F16ACC ## _len, NAMELC ## F16ACC ## _data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, true), s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ #define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ if (device->mul_mat ## ID ## _l_int[TYPE]) { \ @@ -4432,17 +4552,17 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { // Create 6 variants, {s,m,l}x{unaligned,aligned} #define CREATE_MM(TYPE, PIPELINE_NAME, NAMELC, F16ACC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID, REQSUBGROUPSIZE) \ if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->l, #NAMELC #F16ACC "_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->m, #NAMELC #F16ACC "_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->s, #NAMELC #F16ACC "_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, false), 1, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _l[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, l_ ## WARPTILE, l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_l, #NAMELC #F16ACC "_aligned_l", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), l_ ## WG_DENOMS, ggml_vk_mul_mm_spec(l_ ## WARPTILE, true), l_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _m[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, m_ ## WARPTILE, m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_m, #NAMELC #F16ACC "_aligned_m", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), m_ ## WG_DENOMS, ggml_vk_mul_mm_spec(m_ ## WARPTILE, true), m_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ if (device->mul_mat ## ID ## _s[TYPE]) \ - ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## _aligned ## F16ACC ## _fp32_len, NAMELC ## _aligned ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, s_ ## WARPTILE, s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ + ggml_vk_create_pipeline(device, device-> PIPELINE_NAME ->a_s, #NAMELC #F16ACC "_aligned_s", NAMELC ## F16ACC ## _fp32_len, NAMELC ## F16ACC ## _fp32_data, "main", PARAMCOUNT, sizeof(PUSHCONST), s_ ## WG_DENOMS, ggml_vk_mul_mm_spec(s_ ## WARPTILE, true), s_align, false, REQSUBGROUPSIZE > 0, REQSUBGROUPSIZE); \ #define CREATE_MMQ(TYPE, PIPELINE_NAME, NAMELC, WG_DENOMS, WARPTILE, PUSHCONST, PARAMCOUNT, ID) \ if (device->mul_mat ## ID ## _l_int[TYPE]) \ @@ -4599,7 +4719,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { } uint32_t rm_iq = 2 * rm_kq; - const bool use_subgroups = device->subgroup_arithmetic && device->architecture != vk_device_architecture::AMD_GCN; + const bool use_subgroups = device->subgroup_arithmetic; // Ensure a subgroup size >= 16 is available const bool use_subgroups16 = use_subgroups && subgroup_min_size_16; @@ -4837,6 +4957,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_MXFP4], "get_rows_mxfp4_f32", get_rows_mxfp4_f32_len, get_rows_mxfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_NVFP4], "get_rows_nvfp4_f32", get_rows_nvfp4_f32_len, get_rows_nvfp4_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_get_rows_back_f32, "get_rows_back_f32", get_rows_back_f32_len, get_rows_back_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {256, 1, 1}, {}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 3, sizeof(vk_op_flash_attn_split_k_reduce_push_constants), {1, device->subgroup_size, 1}, {device->subgroup_size}, 1, true); @@ -4861,7 +4982,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { } ggml_vk_create_pipeline(device, device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", mul_mat_vec_num_bindings, sizeof(vk_mat_vec_nc_push_constants), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1, true); @@ -4969,9 +5090,10 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_acc_f32, "acc_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0, 1}, 1); ggml_vk_create_pipeline(device, device->pipeline_set_f32, "set_f32", acc_f32_len, acc_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {0, 0}, 1); - ggml_vk_create_pipeline(device, device->pipeline_concat_f32, "concat_f32", concat_f32_len, concat_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_concat_i8, "concat_i8", concat_i8_len, concat_i8_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_concat_i16, "concat_i16", concat_i16_len, concat_i16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_concat_i64, "concat_i64", concat_i64_len, concat_i64_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1); ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1); @@ -4980,11 +5102,6 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_sqrt_f32, "sqrt_f32", sqrt_f32_len, sqrt_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_log[0], "log_f32", log_f32_len, log_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_log[1], "log_f16", log_f16_len, log_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -4994,8 +5111,6 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_diag[0], "diag_f32", diag_f32_len, diag_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_diag[1], "diag_f16", diag_f16_len, diag_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_pad_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); @@ -5006,8 +5121,8 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_repeat_i16, "repeat_i16", repeat_i16_len, repeat_i16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); #define CREATE_UNARY(name) \ - ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); \ - ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_ ## name [0], #name "_f32", name ## _f32_len, name ## _f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); \ + ggml_vk_create_pipeline(device, device->pipeline_ ## name [1], #name "_f16", name ## _f16_len, name ## _f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); CREATE_UNARY(elu) CREATE_UNARY(gelu) @@ -5015,6 +5130,12 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { CREATE_UNARY(gelu_quick) CREATE_UNARY(silu) CREATE_UNARY(relu) + CREATE_UNARY(sqr) + CREATE_UNARY(sqrt) + CREATE_UNARY(sin) + CREATE_UNARY(cos) + CREATE_UNARY(clamp) + CREATE_UNARY(leaky_relu) CREATE_UNARY(xielu) CREATE_UNARY(neg) CREATE_UNARY(tanh) @@ -5030,6 +5151,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { CREATE_UNARY(trunc) CREATE_UNARY(sgn) CREATE_UNARY(exp) + CREATE_UNARY(expm1) #undef CREATE_UNARY ggml_vk_create_pipeline(device, device->pipeline_add1_f16_f16, "add1_f16_f16", add1_f16_f16_len, add1_f16_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); @@ -5053,7 +5175,6 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { CREATE_GLU(geglu_quick) #undef CREATE_GLU - ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_silu_back_f32, "silu_back_f32", silu_back_f32_len, silu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {1, 512, 1}, {}, 1, true); @@ -5174,6 +5295,9 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_conv_transpose_1d_f32, "conv_transpose_1d_f32", conv_transpose_1d_f32_len, conv_transpose_1d_f32_data, "main", 3, sizeof(vk_op_conv_transpose_1d_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_f32, "col2im_1d_f32", col2im_1d_f32_len, col2im_1d_f32_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_f16, "col2im_1d_f16", col2im_1d_f16_len, col2im_1d_f16_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_col2im_1d_bf16, "col2im_1d_bf16", col2im_1d_bf16_len, col2im_1d_bf16_data, "main", 2, sizeof(vk_op_col2im_1d_push_constants), {256, 1, 1}, {}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_snake_f32, "snake_f32", snake_f32_len, snake_f32_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_snake_f16, "snake_f16", snake_f16_len, snake_f16_data, "main", 4, sizeof(vk_op_snake_push_constants), {256, 1, 1}, {}, 1); @@ -5186,14 +5310,14 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv7_f32, "rwkv_wkv7_f32", rwkv_wkv7_f32_len, rwkv_wkv7_f32_data, "main", 8, sizeof(vk_op_rwkv_wkv7_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); { - const uint32_t gdn_sizes[] = {32, 64, 128}; + const uint32_t gdn_sizes[] = {16, 32, 64, 128}; const char * gdn_names[][2] = { + {"gated_delta_net_f32_d16", "gated_delta_net_f32_d16_kda"}, {"gated_delta_net_f32_d32", "gated_delta_net_f32_d32_kda"}, {"gated_delta_net_f32_d64", "gated_delta_net_f32_d64_kda"}, {"gated_delta_net_f32_d128", "gated_delta_net_f32_d128_kda"}, }; - const bool use_subgroup_reduce = device->subgroup_arithmetic; - for (uint32_t si = 0; si < 3; si++) { + for (uint32_t si = 0; si < 4; si++) { const uint32_t S_V = gdn_sizes[si]; GGML_ASSERT(is_pow2(S_V)); @@ -5207,10 +5331,29 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { lanes_per_column = std::min(S_V, device->subgroup_size); } - const bool need_clustered_shader = lanes_per_column != 1 && (lanes_per_column < device->subgroup_size); + // gated_delta_net.comp relies on S_V % COLS_PER_WG == 0 and + // S_V % LANES_PER_COLUMN == 0 to avoid bounds checks. + while (lanes_per_column > 1u) { + const bool valid_lanes = (device->subgroup_size % lanes_per_column) == 0 && + (S_V % lanes_per_column) == 0; + const uint32_t cols_per_wg = valid_lanes ? device->subgroup_size / lanes_per_column : 0; + if (valid_lanes && cols_per_wg > 0 && (S_V % cols_per_wg) == 0) { + break; + } + lanes_per_column >>= 1u; + } + + GGML_ASSERT((device->subgroup_size % lanes_per_column) == 0); + GGML_ASSERT((S_V % lanes_per_column) == 0); + GGML_ASSERT((S_V % (device->subgroup_size / lanes_per_column)) == 0); + + const bool need_partial_subgroup_reduce = lanes_per_column != 1u && lanes_per_column < device->subgroup_size; + const bool use_clustered_reduce = device->subgroup_arithmetic && device->subgroup_clustered && need_partial_subgroup_reduce; + const bool use_subgroup_reduce = device->subgroup_arithmetic && !need_partial_subgroup_reduce; + const bool use_subgroup_ops = use_clustered_reduce || use_subgroup_reduce; size_t gdn_len; const void * gdn_data; - if (use_subgroup_reduce && need_clustered_shader) { + if (use_clustered_reduce) { gdn_len = gated_delta_net_f32_len; gdn_data = (const void *)gated_delta_net_f32_data; } else if (use_subgroup_reduce) { @@ -5227,7 +5370,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { for (uint32_t kda = 0; kda < 2; kda++) { ggml_vk_create_pipeline(device, device->pipeline_gated_delta_net[si][kda], gdn_names[si][kda], gdn_len, gdn_data, "main", 7, sizeof(vk_op_gated_delta_net_push_constants), - wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_reduce, device->subgroup_size); + wg_denoms, {S_V, kda, device->subgroup_size, lanes_per_column}, 1, true, use_subgroup_ops, device->subgroup_size); } } } @@ -5248,7 +5391,7 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { ggml_vk_create_pipeline(device, device->pipeline_opt_step_sgd_f32, "opt_step_sgd_f32", opt_step_sgd_f32_len, opt_step_sgd_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - // conv2d, conv_transpose_2d + // conv2d, conv_transpose_2d, conv3d for (uint32_t s = 0; s < CONV_SHAPE_COUNT; ++s) { // smaller WG for the small-tile fallback gives more concurrent WGs per SM uint32_t conv2d_WG_SIZE = (s == CONV_SHAPE_64x32) ? 128 : 256; @@ -5311,8 +5454,8 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { return (conv2d_BS.K * (conv2d_BS.CRS + pad) + conv2d_BS.CRS * (conv2d_BS.NPQ + pad) + csh_elems) * elem_size; }; - // coopmat1 needs to store the output through shared memory, so check up front - // whether it'll fit and disable it before applying coopmat1 parameters. + // 2D, transpose-2D, and 3D conv use the same KxCRS @ CRSxNPQ shmem + // layout. cm1 needs Csh for output, so check before applying cm1 params. if (conv2d_use_cm1 && device->properties.limits.maxComputeSharedMemorySize < shmem_req(conv2d_cm1_shmem_pad, true, true)) { conv2d_use_cm1 = false; } @@ -5404,6 +5547,53 @@ static void ggml_vk_load_shaders(vk_device& device, vk_pipeline requested) { } #undef CREATE_CONV #undef CREATE_CONVS + + std::vector conv3d_spec_constants = { conv2d_WG_SIZE, conv2d_BS.K, conv2d_BS.CRS, conv2d_BS.NPQ, conv2d_TS_K, conv2d_SHMEM_PAD }; +#define CREATE_CONV3D(type_suffix, spv_suffix) \ + for (auto &c : device->pipeline_conv3d##type_suffix[s]) { \ + const vk_conv3d_pipeline_state &state = c.first; \ + std::vector spec_constants_cpy = conv3d_spec_constants; \ + spec_constants_cpy.push_back(state.s0); \ + spec_constants_cpy.push_back(state.s1); \ + spec_constants_cpy.push_back(state.s2); \ + spec_constants_cpy.push_back(state.p0); \ + spec_constants_cpy.push_back(state.p1); \ + spec_constants_cpy.push_back(state.p2); \ + spec_constants_cpy.push_back(state.d0); \ + spec_constants_cpy.push_back(state.d1); \ + spec_constants_cpy.push_back(state.d2); \ + spec_constants_cpy.push_back(state.KW); \ + spec_constants_cpy.push_back(state.KH); \ + spec_constants_cpy.push_back(state.KD); \ + spec_constants_cpy.push_back(state.aligned); \ + spec_constants_cpy.push_back(conv2d_csh_store); \ + spec_constants_cpy.push_back(conv2d_WM); \ + spec_constants_cpy.push_back(conv2d_WN); \ + ggml_vk_create_pipeline( \ + device, c.second, "conv3d" #type_suffix, \ + conv3d##type_suffix##spv_suffix##_len, conv3d##type_suffix##spv_suffix##_data, "main", 3, \ + sizeof(vk_op_conv3d_push_constants), wg_denoms, spec_constants_cpy, 1, true, conv2d_required_subgroup_size != 0, conv2d_required_subgroup_size); \ + } +#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + if (device->coopmat2) { + CREATE_CONV3D(_f32, _cm2) + CREATE_CONV3D(_f16_f32, _cm2) + } else +#endif +#if defined(VK_KHR_cooperative_matrix) && defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + if (conv2d_use_cm1) { + CREATE_CONV3D(_f32, _cm1) + CREATE_CONV3D(_f16_f32, _cm1) + } else +#endif + if (conv2d_UNROLL) { + CREATE_CONV3D(_f32, _unroll) + CREATE_CONV3D(_f16_f32, _unroll) + } else { + CREATE_CONV3D(_f32, ) + CREATE_CONV3D(_f16_f32, ) + } +#undef CREATE_CONV3D } ggml_vk_create_pipeline(device, device->pipeline_conv2d_dw_whcn_f32, "conv2d_dw_whcn_f32", conv2d_dw_whcn_f32_len, conv2d_dw_whcn_f32_data, "main", 3, sizeof(vk_op_conv2d_dw_push_constants), {512, 1, 1}, {}, 1); @@ -5698,6 +5888,14 @@ static vk_device ggml_vk_get_device(size_t idx) { device->subgroup_vote = (vk11_props.subgroupSupportedStages & vk::ShaderStageFlagBits::eCompute) && (vk11_props.subgroupSupportedOperations & vk::SubgroupFeatureFlagBits::eVote); + // Submit at least every 100 nodes, in case there are workloads without as much matmul. + device->max_nodes_per_submit = 100; + const char* GGML_VK_MAX_NODES_PER_SUBMIT = getenv("GGML_VK_MAX_NODES_PER_SUBMIT"); + if (GGML_VK_MAX_NODES_PER_SUBMIT != nullptr) { + uint32_t max_nodes_per_submit = std::stoul(GGML_VK_MAX_NODES_PER_SUBMIT); + device->max_nodes_per_submit = std::max(max_nodes_per_submit, 1u); + } + const bool force_disable_f16 = getenv("GGML_VK_DISABLE_F16") != nullptr; device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; @@ -6172,9 +6370,8 @@ static vk_device ggml_vk_get_device(size_t idx) { break; case VK_VENDOR_ID_INTEL: { // Current Windows driver does not expose BF16 support. - // We only want to use l_warptile if coopmat is available and is Xe2+ - const bool xe2_with_coopmat = device->coopmat_support && device->architecture == INTEL_XE2; - const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && xe2_with_coopmat) : xe2_with_coopmat; + // We only want to use l_warptile if coopmat is available + const bool use_l_warptile = (i == GGML_TYPE_BF16) ? (device->coopmat_bf16_support && device->coopmat_support) : device->coopmat_support; device->mul_mat_l[i] = use_l_warptile; device->mul_mat_id_l[i] = use_l_warptile; device->mul_mat_m[i] = true; @@ -6202,6 +6399,19 @@ static vk_device ggml_vk_get_device(size_t idx) { break; } +#if VK_HEADER_VERSION >= 287 + // Honeykrisp driver for Asahi Linux doesn't report VK_VENDOR_ID_APPLE. + // Check for Honeykrisp driver and force same configuration as the VK_VENDOR_ID_APPLE case. + if (device->driver_id == vk::DriverId::eMesaHoneykrisp) { + device->mul_mat_l[i] = false; + device->mul_mat_m[i] = true; + device->mul_mat_s[i] = false; + device->mul_mat_id_l[i] = false; + device->mul_mat_id_m[i] = true; + device->mul_mat_id_s[i] = false; + } +#endif + device->mul_mat_l_int[i] = device->mul_mat_l[i]; device->mul_mat_m_int[i] = device->mul_mat_m[i]; device->mul_mat_s_int[i] = device->mul_mat_s[i]; @@ -7604,8 +7814,12 @@ static void ggml_vk_buffer_write_2d(vk_buffer& dst, size_t offset, const void * if(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible) { GGML_ASSERT(dst->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); - for (size_t i = 0; i < height; i++) { - memcpy((uint8_t *)dst->ptr + offset + i * dpitch, (const uint8_t *) src + i * spitch, width); + if (width == spitch && width == dpitch) { + memcpy((uint8_t *)dst->ptr + offset, src, width * height); + } else { + for (size_t i = 0; i < height; i++) { + memcpy((uint8_t *)dst->ptr + offset + i * dpitch, (const uint8_t *) src + i * spitch, width); + } } } else { std::lock_guard guard(dst->device->mutex); @@ -7724,8 +7938,29 @@ static void ggml_vk_buffer_read_2d(vk_buffer& src, size_t offset, void * dst, si if(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible && src->device->uma) { GGML_ASSERT(src->memory_property_flags & vk::MemoryPropertyFlagBits::eHostCoherent); - for (size_t i = 0; i < height; i++) { - memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) src->ptr + offset + i * spitch, width); + std::lock_guard guard(src->device->mutex); + vk_context subctx = ggml_vk_create_temporary_context(src->device->compute_queue.cmd_pool); + ggml_vk_ctx_begin(src->device, subctx); + subctx->s->buffer->buf.pipelineBarrier( + vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer, + vk::PipelineStageFlagBits::eHost, + {}, + { { vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferWrite, + vk::AccessFlagBits::eHostRead } }, + {}, {}); + ggml_vk_ctx_end(subctx); + ggml_vk_submit(subctx, src->device->fence); + VK_CHECK(src->device->device.waitForFences({ src->device->fence }, true, UINT64_MAX), + "vk_buffer_read_2d uma waitForFences"); + src->device->device.resetFences({ src->device->fence }); + ggml_vk_queue_command_pools_cleanup(src->device); + + if (width == spitch && width == dpitch) { + memcpy(dst, (const uint8_t *) src->ptr + offset, width * height); + } else { + for (size_t i = 0; i < height; i++) { + memcpy((uint8_t *) dst + i * dpitch, (const uint8_t *) src->ptr + offset + i * spitch, width); + } } } else { std::lock_guard guard(src->device->mutex); @@ -8154,7 +8389,6 @@ static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& subctx, vk_pipeline pipeline, const ggml_tensor * tensor, const vk_subbuffer & in, const vk_subbuffer & out) { VK_LOG_DEBUG("ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")"); - const int tensor_type_size = ggml_type_size(tensor->type); const uint32_t ne = ggml_nelements(tensor); std::array elements; @@ -8167,14 +8401,11 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& elements = { ne, 1, 1 }; } - vk_op_unary_push_constants pc = { - (uint32_t)ne, - (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size, - (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], 1 , (uint32_t)tensor->ne[0] , (uint32_t)(tensor->ne[0] * tensor->ne[1]) , (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]), - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; + vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne); + pc.nb10 = 1; + pc.nb11 = (uint32_t)tensor->ne[0]; + pc.nb12 = (uint32_t)(tensor->ne[0] * tensor->ne[1]); + pc.nb13 = (uint32_t)(tensor->ne[0] * tensor->ne[1] * tensor->ne[2]); init_pushconst_fastdiv(pc); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); ggml_vk_sync_buffers(ctx, subctx); @@ -8188,7 +8419,6 @@ static void ggml_vk_cpy_to_strided( uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13) { VK_LOG_DEBUG("ggml_vk_cpy_to_strided((" << tensor << ", type=" << tensor->type << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; std::cerr << "dst_nb=(" << nb10 << ", " << nb11 << ", " << nb12 << ", " << nb13 << "), buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")"); - const int tensor_type_size = ggml_type_size(tensor->type); const uint32_t ne = ggml_nelements(tensor); std::array elements; @@ -8201,14 +8431,11 @@ static void ggml_vk_cpy_to_strided( elements = { ne, 1, 1 }; } - vk_op_unary_push_constants pc = { - (uint32_t)ne, - (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], (uint32_t)tensor->nb[0] / tensor_type_size, (uint32_t)tensor->nb[1] / tensor_type_size, (uint32_t)tensor->nb[2] / tensor_type_size, (uint32_t)tensor->nb[3] / tensor_type_size, - (uint32_t)tensor->ne[0], (uint32_t)tensor->ne[1], (uint32_t)tensor->ne[2], (uint32_t)tensor->ne[3], nb10, nb11, nb12, nb13, - 0, - 0.0f, 0.0f, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; + vk_op_unary_push_constants pc = vk_op_unary_push_constants_init(tensor, tensor, ne); + pc.nb10 = nb10; + pc.nb11 = nb11; + pc.nb12 = nb12; + pc.nb13 = nb13; init_pushconst_fastdiv(pc); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); ggml_vk_sync_buffers(ctx, subctx); @@ -10198,6 +10425,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_get_rows_f32[src0->type]; } return nullptr; + case GGML_OP_GET_ROWS_BACK: + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_get_rows_back_f32; + } + return nullptr; case GGML_OP_ACC: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_acc_f32; @@ -10260,17 +10492,27 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_add_id_f32; } return nullptr; - case GGML_OP_CONCAT: - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_concat_f32; + case GGML_OP_CONCAT: { + if (src0->type != src1->type || src0->type != dst->type) { + return nullptr; } - if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - return ctx->device->pipeline_concat_f16; + if (ggml_blck_size(src0->type) != 1) { + return nullptr; } - if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) { + const size_t type_size = ggml_type_size(src0->type); + switch (type_size) { + case 1: + return ctx->device->pipeline_concat_i8; + case 2: + return ctx->device->pipeline_concat_i16; + case 4: return ctx->device->pipeline_concat_i32; + case 8: + return ctx->device->pipeline_concat_i64; + default: + return nullptr; } - return nullptr; + } case GGML_OP_UPSCALE: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { uint32_t mode = (ggml_get_op_params_i32(dst, 0) & (0xFF | GGML_SCALE_FLAG_ANTIALIAS)); @@ -10294,23 +10536,27 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const } return nullptr; case GGML_OP_SQR: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_sqr_f32; + if (src0->type == dst->type && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { + return ctx->device->pipeline_sqr[dst->type == GGML_TYPE_F16]; } return nullptr; case GGML_OP_SQRT: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_sqrt_f32; + if (src0->type == dst->type && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { + return ctx->device->pipeline_sqrt[dst->type == GGML_TYPE_F16]; } return nullptr; case GGML_OP_SIN: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_sin_f32; + if (src0->type == dst->type && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { + return ctx->device->pipeline_sin[dst->type == GGML_TYPE_F16]; } return nullptr; case GGML_OP_COS: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_cos_f32; + if (src0->type == dst->type && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { + return ctx->device->pipeline_cos[dst->type == GGML_TYPE_F16]; } return nullptr; case GGML_OP_LOG: @@ -10332,8 +10578,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const } return nullptr; case GGML_OP_CLAMP: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_clamp_f32; + if (src0->type == dst->type && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { + return ctx->device->pipeline_clamp[dst->type == GGML_TYPE_F16]; } return nullptr; case GGML_OP_PAD: @@ -10413,6 +10660,8 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const switch (ggml_get_unary_op(dst)) { case GGML_UNARY_OP_EXP: return ctx->device->pipeline_exp[dst->type == GGML_TYPE_F16]; + case GGML_UNARY_OP_EXPM1: + return ctx->device->pipeline_expm1[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_ELU: return ctx->device->pipeline_elu[dst->type == GGML_TYPE_F16]; case GGML_UNARY_OP_SILU: @@ -10631,6 +10880,13 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_conv_transpose_1d_f32; } return nullptr; + case GGML_OP_COL2IM_1D: + switch (src0->type) { + case GGML_TYPE_F32: return ctx->device->pipeline_col2im_1d_f32; + case GGML_TYPE_F16: return ctx->device->pipeline_col2im_1d_f16; + case GGML_TYPE_BF16: return ctx->device->pipeline_col2im_1d_bf16; + default: return nullptr; + } case GGML_OP_POOL_2D: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_pool2d_f32; @@ -10652,9 +10908,10 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const const uint32_t kda = (dst->src[3]->ne[0] == (int64_t)S_v) ? 1 : 0; uint32_t si; switch (S_v) { - case 32: si = 0; break; - case 64: si = 1; break; - case 128: si = 2; break; + case 16: si = 0; break; + case 32: si = 1; break; + case 64: si = 2; break; + case 128: si = 3; break; default: return nullptr; } return ctx->device->pipeline_gated_delta_net[si][kda]; @@ -10691,8 +10948,9 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const } return nullptr; case GGML_OP_LEAKY_RELU: - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->device->pipeline_leaky_relu_f32; + if (src0->type == dst->type && + (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16)) { + return ctx->device->pipeline_leaky_relu[dst->type == GGML_TYPE_F16]; } return nullptr; case GGML_OP_CONV_2D: @@ -10769,6 +11027,61 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const } } return nullptr; + case GGML_OP_CONV_3D: + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + const uint32_t OC = (uint32_t)ggml_get_op_params_i32(dst, 11); + const uint32_t IC = (uint32_t)ggml_get_op_params_i32(dst, 9); + const uint32_t N = (uint32_t)ggml_get_op_params_i32(dst, 10); + const uint32_t NPQ = N * dst->ne[2] * dst->ne[1] * dst->ne[0]; + const vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, OC, NPQ); + + const uint32_t KW = (uint32_t)src0->ne[0]; + const uint32_t KH = (uint32_t)src0->ne[1]; + const uint32_t KD = (uint32_t)src0->ne[2]; + const uint32_t s0 = (uint32_t)ggml_get_op_params_i32(dst, 0); + const uint32_t s1 = (uint32_t)ggml_get_op_params_i32(dst, 1); + const uint32_t s2 = (uint32_t)ggml_get_op_params_i32(dst, 2); + const uint32_t p0 = (uint32_t)ggml_get_op_params_i32(dst, 3); + const uint32_t p1 = (uint32_t)ggml_get_op_params_i32(dst, 4); + const uint32_t p2 = (uint32_t)ggml_get_op_params_i32(dst, 5); + const uint32_t d0 = (uint32_t)ggml_get_op_params_i32(dst, 6); + const uint32_t d1 = (uint32_t)ggml_get_op_params_i32(dst, 7); + const uint32_t d2 = (uint32_t)ggml_get_op_params_i32(dst, 8); + + const uint32_t CRS = IC * KW * KH * KD; + const uint32_t BS_K = vk_conv_block_sizes[shape].K; + const uint32_t BS_CRS = vk_conv_block_sizes[shape].CRS; + const uint32_t BS_NPQ = vk_conv_block_sizes[shape].NPQ; + const uint32_t aligned = ((OC % BS_K == 0) && + (CRS % BS_CRS == 0) && + (NPQ % BS_NPQ == 0)) ? 1u : 0u; + + vk_conv3d_pipeline_state conv3d_pipeline_state(s0, s1, s2, p0, p1, p2, d0, d1, d2, KW, KH, KD, aligned); + + std::map *pipelines = nullptr; + if (src0->type == GGML_TYPE_F32) { + pipelines = &ctx->device->pipeline_conv3d_f32[shape]; + } else if (src0->type == GGML_TYPE_F16) { + pipelines = &ctx->device->pipeline_conv3d_f16_f32[shape]; + } else { + return nullptr; + } + + vk_pipeline pipeline = nullptr; + + { + std::lock_guard guard(ctx->device->compile_mutex); + auto it = pipelines->find(conv3d_pipeline_state); + if (it != pipelines->end()) { + pipeline = it->second; + } else { + (*pipelines)[conv3d_pipeline_state] = pipeline = std::make_shared(); + } + } + + return pipeline; + } + return nullptr; case GGML_OP_ADD1: if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { return ctx->device->pipeline_add1_f16_f16; @@ -10811,6 +11124,21 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk GGML_UNUSED(src3); } +template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_glu_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); + const uint32_t b_offset = src1 ? get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type) : a_offset; + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + GGML_ASSERT(a_offset < (1u << 8)); + GGML_ASSERT(b_offset < (1u << 8)); + GGML_ASSERT(d_offset < (1u << 8)); + + p.misalign_offsets = (a_offset << 16) | (b_offset << 8) | d_offset; + + GGML_UNUSED(src2); + GGML_UNUSED(src3); +} + template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * src3, ggml_tensor * dst) { const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); @@ -11004,6 +11332,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]); elements[2] = std::min(elements[2], ctx->device->properties.limits.maxComputeWorkGroupCount[2]); break; + case GGML_OP_GET_ROWS_BACK: + elements = { (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], 1 }; + elements[1] = std::min(elements[1], ctx->device->properties.limits.maxComputeWorkGroupCount[1]); + break; case GGML_OP_ARGSORT: GGML_ASSERT(0); break; @@ -11061,6 +11393,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co { elements = {uint32_t(src0->ne[1]), 1, 1}; // parallelize in {Cout, 1, 1} } break; + case GGML_OP_COL2IM_1D: + { + elements = { uint32_t(dst->ne[0]), uint32_t(dst->ne[1]), 1 }; + } break; case GGML_OP_POOL_2D: { const uint32_t N = dst->ne[3]; @@ -11085,6 +11421,21 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co GGML_ABORT("invalid push constant type for CONV_2D"); } break; + case GGML_OP_CONV_3D: + if constexpr (std::is_same_v) { + const uint32_t NPQ = pc.N * pc.OD * pc.OH * pc.OW; + const vk_conv_shapes shape = ggml_vk_conv_select_shape(ctx, pc.OC, NPQ); + const uint32_t NPQ_blocks = CEIL_DIV(NPQ, vk_conv_block_sizes[shape].NPQ); + + elements = { pc.OC, NPQ_blocks, 1 }; + if (elements[1] > 512) { + elements[2] = CEIL_DIV(elements[1], 512); + elements[1] = 512; + } + } else { + GGML_ABORT("invalid push constant type for CONV_3D"); + } + break; case GGML_OP_ADD: case GGML_OP_SUB: case GGML_OP_DIV: @@ -11101,6 +11452,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_TRI: case GGML_OP_DIAG: case GGML_OP_CLAMP: + case GGML_OP_LEAKY_RELU: case GGML_OP_PAD: case GGML_OP_ROLL: case GGML_OP_REPEAT: @@ -11245,6 +11597,21 @@ static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context& subctx, }); } +static void ggml_vk_get_rows_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GET_ROWS_BACK, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2], (uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, 0, + }); +} + static void ggml_vk_acc(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); @@ -11952,8 +12319,10 @@ static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx, static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = op_params[0]; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f, 0.0f, 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_NORM, std::move(p)); } static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { @@ -12160,17 +12529,17 @@ static void ggml_vk_l2_norm(ggml_backend_vk_context * ctx, vk_context& subctx, c } static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f, 0.0f, 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, vk_op_unary_push_constants_init(src0, dst)); } static void ggml_vk_xielu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, - { - (uint32_t)ggml_nelements(src0), 0, - op_params[1], op_params[2], op_params[3], op_params[4] - } - ); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = op_params[1]; + p.param2 = op_params[2]; + p.param3 = op_params[3]; + p.param4 = op_params[4]; + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_UNARY, std::move(p)); } static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -12190,6 +12559,9 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const } const uint32_t mode = split ? 2 : (swapped ? 1 : 0); + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = split ? ggml_type_size(src1->type) : src0_type_size; + const uint32_t dst_type_size = ggml_type_size(dst->type); ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_GLU, { @@ -12199,16 +12571,22 @@ static void ggml_vk_glu(ggml_backend_vk_context * ctx, vk_context& subctx, const mode, alpha, limit, - (uint32_t)(src0->nb[1] / src0->nb[0]), - (uint32_t)(src0->nb[2] / src0->nb[0]), - (uint32_t)(src0->nb[3] / src0->nb[0]), - (uint32_t)src0->ne[1], - (uint32_t)src0->ne[2], - (uint32_t)(dst->nb[1] / dst->nb[0]), - (uint32_t)(dst->nb[2] / dst->nb[0]), - (uint32_t)(dst->nb[3] / dst->nb[0]), + (uint32_t)(src0->nb[0] / src0_type_size), + (uint32_t)(src0->nb[1] / src0_type_size), + (uint32_t)(src0->nb[2] / src0_type_size), + (uint32_t)(src0->nb[3] / src0_type_size), + (uint32_t)((split ? src1->nb[0] : src0->nb[0]) / src1_type_size), + (uint32_t)((split ? src1->nb[1] : src0->nb[1]) / src1_type_size), + (uint32_t)((split ? src1->nb[2] : src0->nb[2]) / src1_type_size), + (uint32_t)((split ? src1->nb[3] : src0->nb[3]) / src1_type_size), + (uint32_t)(dst->nb[0] / dst_type_size), + (uint32_t)(dst->nb[1] / dst_type_size), + (uint32_t)(dst->nb[2] / dst_type_size), + (uint32_t)(dst->nb[3] / dst_type_size), (uint32_t)dst->ne[1], - (uint32_t)dst->ne[2] + (uint32_t)dst->ne[2], + 0, + 0, 0, 0, 0, 0, 0, }); } @@ -12841,6 +13219,32 @@ static void ggml_vk_conv_transpose_1d(ggml_backend_vk_context * ctx, vk_context& ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_TRANSPOSE_1D, std::move(p)); } +static void ggml_vk_col2im_1d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { + // src0: [K_OC, T_in] columns from matmul + // dst: [T_out, OC] + + const int32_t stride = dst->op_params[0]; + const int32_t oc = dst->op_params[1]; + const int32_t p0 = dst->op_params[2]; + + const uint32_t K_OC = static_cast(src0->ne[0]); + const uint32_t T_in = static_cast(src0->ne[1]); + const uint32_t T_out = static_cast(dst->ne[0]); + const uint32_t OC = static_cast(oc); + const uint32_t K = K_OC / OC; + + vk_op_col2im_1d_push_constants p{}; + p.T_out = T_out; + p.OC = OC; + p.K_OC = K_OC; + p.T_in = T_in; + p.K = K; + p.stride = stride; + p.p0 = p0; + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_COL2IM_1D, std::move(p)); +} + // Dispatch the fused snake activation: y = x + sin^2(a * x) * inv_b. // Match the naive mul -> sin -> sqr -> mul -> add chain and run the // dedicated kernel directly. The pattern is validated by @@ -12948,6 +13352,51 @@ static void ggml_vk_conv_2d(ggml_backend_vk_context * ctx, vk_context & subctx, ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, dst->op, std::move(p)); } +static void ggml_vk_conv_3d(ggml_backend_vk_context * ctx, vk_context & subctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT(nb00 == sizeof(float) || nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + GGML_ASSERT(nb0 == sizeof(float)); + + vk_op_conv3d_push_constants p{}; + p.IC = static_cast(ggml_get_op_params_i32(dst, 9)); + p.N = static_cast(ggml_get_op_params_i32(dst, 10)); + p.OC = static_cast(ggml_get_op_params_i32(dst, 11)); + GGML_ASSERT(src0->ne[3] == (int64_t)p.IC * p.OC); + GGML_ASSERT(src1->ne[3] == (int64_t)p.IC * p.N); + GGML_ASSERT(dst->ne[3] == (int64_t)p.OC * p.N); + + p.IW = static_cast(ne10); + p.IH = static_cast(ne11); + p.ID = static_cast(ne12); + p.OW = static_cast(ne0); + p.OH = static_cast(ne1); + p.OD = static_cast(ne2); + + // the shader clamps src addresses to p.IC * p.N * p.IW * p.IH * p.ID - 1 in uint32, so the + // total input element count must fit in a uint32. + GGML_ASSERT((uint64_t)p.IC * p.N * p.IW * p.IH * p.ID <= 0xFFFFFFFFull); + + p.nb01 = static_cast(nb01 / nb00); + p.nb02 = static_cast(nb02 / nb00); + p.nb03 = static_cast(nb03 / nb00); + + p.nb11 = static_cast(nb11 / nb10); + p.nb12 = static_cast(nb12 / nb10); + p.nb13 = static_cast(nb13 / nb10); + + p.nb1 = static_cast(nb1 / nb0); + p.nb2 = static_cast(nb2 / nb0); + p.nb3 = static_cast(nb3 / nb0); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, nullptr, dst, GGML_OP_CONV_3D, std::move(p)); +} + static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { vk_op_conv2d_dw_push_constants p{}; p.ne = ggml_nelements(dst); @@ -12974,7 +13423,10 @@ static void ggml_vk_conv_2d_dw(ggml_backend_vk_context * ctx, vk_context& subctx static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) { const float * op_params = (const float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f, 0.0f, 0.0f }); + vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst); + p.param1 = op_params[0]; + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, std::move(p)); } #ifdef GGML_VULKAN_RUN_TESTS @@ -14077,6 +14529,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_GET_ROWS: ggml_vk_get_rows(ctx, compute_ctx, src0, src1, node); + break; + case GGML_OP_GET_ROWS_BACK: + ggml_vk_get_rows_back(ctx, compute_ctx, src0, src1, node); + break; case GGML_OP_ADD: if (ctx->num_additional_fused_ops) { @@ -14211,6 +14667,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr switch (ggml_get_unary_op(node)) { case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_EXPM1: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU_ERF: @@ -14327,6 +14784,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_TIMESTEP_EMBEDDING: ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node); + break; + case GGML_OP_COL2IM_1D: + ggml_vk_col2im_1d(ctx, compute_ctx, src0, node); + break; case GGML_OP_CONV_TRANSPOSE_1D: ggml_vk_conv_transpose_1d(ctx, compute_ctx, src0, src1, node); @@ -14340,6 +14801,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_CONV_TRANSPOSE_2D: ggml_vk_conv_2d(ctx, compute_ctx, src0, src1, node); + break; + case GGML_OP_CONV_3D: + ggml_vk_conv_3d(ctx, compute_ctx, src0, src1, node); + break; case GGML_OP_CONV_2D_DW: ggml_vk_conv_2d_dw(ctx, compute_ctx, src0, src1, node); @@ -15725,8 +16190,6 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution. // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB // (and scaled down based on model size, so smaller models submit earlier). - // Also submit at least every 100 nodes, in case there are workloads without as much matmul. - int nodes_per_submit = 100; int submitted_nodes = 0; int submit_count = 0; uint64_t mul_mat_bytes = 0; @@ -15952,7 +16415,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg // Signal the almost_ready fence when the graph is mostly complete (< 20% remaining) bool almost_ready = (cgraph->n_nodes - i) < cgraph->n_nodes / 5; - bool submit = (submitted_nodes >= nodes_per_submit) || + bool submit = ((uint32_t)submitted_nodes >= ctx->device->max_nodes_per_submit) || (mul_mat_bytes_per_submit != 0 && mul_mat_bytes >= mul_mat_bytes_per_submit) || (i + ctx->num_additional_fused_ops >= last_node) || (almost_ready && !ctx->almost_ready_fence_pending); @@ -16600,6 +17063,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_EXPM1: case GGML_UNARY_OP_ELU: case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_GELU_ERF: @@ -16620,8 +17084,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_UNARY_OP_FLOOR: case GGML_UNARY_OP_TRUNC: case GGML_UNARY_OP_SGN: - return ggml_is_contiguous(op->src[0]) && - (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (op->src[0]->type == op->type); default: @@ -16637,7 +17100,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_GLU_OP_GEGLU_QUICK: return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && - (op->src[0]->type == op->type); + (op->src[0]->type == op->type) && + (!op->src[1] || op->src[1]->type == op->src[0]->type); default: return false; } @@ -16788,6 +17252,8 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return false; } } + case GGML_OP_GET_ROWS_BACK: + return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_SET_ROWS: { switch (op->type) { @@ -16884,12 +17350,11 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_TRANSPOSE: case GGML_OP_RMS_NORM: return true; - case GGML_OP_NORM: case GGML_OP_GROUP_NORM: return ggml_is_contiguous(op->src[0]); + case GGML_OP_NORM: case GGML_OP_L2_NORM: - return ggml_is_contiguous_rows(op->src[0]) && - op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; + return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32; case GGML_OP_ADD: case GGML_OP_SUB: case GGML_OP_MUL: @@ -16908,8 +17373,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_SIN: case GGML_OP_COS: case GGML_OP_CLAMP: - return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_LEAKY_RELU: + return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + op->type == op->src[0]->type; case GGML_OP_OPT_STEP_ADAMW: case GGML_OP_OPT_STEP_SGD: return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; @@ -16956,8 +17422,14 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_SET: return op->src[0]->type == op->src[1]->type && op->src[0]->type == op->type && (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_I32); - case GGML_OP_CONCAT: - return ggml_type_size(op->src[0]->type) == ggml_type_size(GGML_TYPE_F32); + case GGML_OP_CONCAT: { + if (op->src[0]->type != op->src[1]->type || op->src[0]->type != op->type) { + return false; + } + const size_t type_size = ggml_type_size(op->type); + return ggml_blck_size(op->type) == 1 && + (type_size == 1 || type_size == 2 || type_size == 4 || type_size == 8); + } case GGML_OP_ADD1: return (op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32) || (op->src[0]->type == GGML_TYPE_F16 && op->src[1]->type == GGML_TYPE_F32) @@ -17033,7 +17505,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_GATED_DELTA_NET: { const uint32_t S_v = op->src[2]->ne[0]; - if (S_v != 32 && S_v != 64 && S_v != 128) { + if (S_v != 16 && S_v != 32 && S_v != 64 && S_v != 128) { return false; } for (int i = 0; i < 6; i++) { @@ -17085,6 +17557,13 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm return op->src[0]->type == GGML_TYPE_F32; case GGML_OP_CONV_TRANSPOSE_1D: return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + case GGML_OP_COL2IM_1D: + return (op->src[0]->type == GGML_TYPE_F32 || + op->src[0]->type == GGML_TYPE_F16 || + op->src[0]->type == GGML_TYPE_BF16) && + op->type == op->src[0]->type && + ggml_is_contiguous(op->src[0]) && + ggml_is_contiguous(op); case GGML_OP_CONV_2D: case GGML_OP_CONV_TRANSPOSE_2D: { @@ -17096,6 +17575,13 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm ggml_is_contiguous(op->src[1]) && ggml_is_contiguous(op)); } + case GGML_OP_CONV_3D: + return (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) && + op->src[1]->type == GGML_TYPE_F32 && + op->type == GGML_TYPE_F32 && + ggml_is_contiguous(op->src[0]) && + ggml_is_contiguous(op->src[1]) && + ggml_is_contiguous(op); default: return false; } @@ -17412,9 +17898,9 @@ static bool ggml_vk_device_is_supported(const vk::PhysicalDevice & vkdev) { static bool ggml_vk_khr_cooperative_matrix_support(const vk::PhysicalDeviceProperties& props, const vk::PhysicalDeviceDriverProperties& driver_props, vk_device_architecture arch) { switch (props.vendorID) { case VK_VENDOR_ID_INTEL: - // Only allowing Xe2 GPU at the moment since Xe2 GPU can gain significant performance boost, - // while some older hardware (ex. Arc A770) has performance regressions - return arch == vk_device_architecture::INTEL_XE2; + // Only allowing Xe2/Xe3 GPU and integrated Xe GPUs at the moment since older hardware (ex. Arc A770) has performance regressions. + return (arch == vk_device_architecture::INTEL_XE2) || + (arch == vk_device_architecture::INTEL_XE1 && props.deviceType == vk::PhysicalDeviceType::eIntegratedGpu && driver_props.driverID == vk::DriverId::eIntelProprietaryWindows); case VK_VENDOR_ID_AMD: if (driver_props.driverID == vk::DriverId::eAmdProprietary || driver_props.driverID == vk::DriverId::eAmdOpenSource) { // Workaround for AMD proprietary driver reporting support on all GPUs @@ -17462,6 +17948,8 @@ static uint32_t ggml_vk_intel_shader_core_count(const vk::PhysicalDevice& vkdev) case 0xE20B: // B580 case 0xE211: // Pro B60 return 20; + case 0xB080: // PTL Xe3 LPG 2x6 (12 subslices) + return 12; default: return 0; } @@ -17767,6 +18255,9 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * case GGML_UNARY_OP_EXP: tensor_clone = ggml_exp(ggml_ctx, src_clone[0]); break; + case GGML_UNARY_OP_EXPM1: + tensor_clone = ggml_expm1(ggml_ctx, src_clone[0]); + break; case GGML_UNARY_OP_ELU: tensor_clone = ggml_elu(ggml_ctx, src_clone[0]); break; @@ -17913,6 +18404,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * const int32_t p0 = tensor->op_params[1]; const int32_t d0 = tensor->op_params[2]; tensor_clone = ggml_conv_transpose_1d(ggml_ctx, src_clone[0], src_clone[1], s0, p0, d0); + } else if (tensor->op == GGML_OP_COL2IM_1D) { + const int32_t stride = tensor->op_params[0]; + const int32_t oc = tensor->op_params[1]; + const int32_t p0 = tensor->op_params[2]; + tensor_clone = ggml_col2im_1d(ggml_ctx, src_clone[0], stride, oc, p0); } else if (tensor->op == GGML_OP_POOL_2D) { enum ggml_op_pool op = static_cast(tensor->op_params[0]); const int32_t k0 = tensor->op_params[1]; @@ -17931,6 +18427,20 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph * const int32_t d0 = tensor->op_params[4]; const int32_t d1 = tensor->op_params[5]; tensor_clone = ggml_conv_2d(ggml_ctx, src_clone[0], src_clone[1], s0, s1, p0, p1, d0, d1); + } else if (tensor->op == GGML_OP_CONV_3D) { + const int32_t s0 = tensor->op_params[0]; + const int32_t s1 = tensor->op_params[1]; + const int32_t s2 = tensor->op_params[2]; + const int32_t p0 = tensor->op_params[3]; + const int32_t p1 = tensor->op_params[4]; + const int32_t p2 = tensor->op_params[5]; + const int32_t d0 = tensor->op_params[6]; + const int32_t d1 = tensor->op_params[7]; + const int32_t d2 = tensor->op_params[8]; + const int32_t IC = tensor->op_params[9]; + const int32_t N = tensor->op_params[10]; + const int32_t OC = tensor->op_params[11]; + tensor_clone = ggml_conv_3d_direct(ggml_ctx, src_clone[0], src_clone[1], s0, s1, s2, p0, p1, p2, d0, d1, d2, IC, N, OC); } else if (tensor->op == GGML_OP_CONV_2D_DW) { const int32_t s0 = tensor->op_params[0]; const int32_t s1 = tensor->op_params[1]; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp deleted file mode 100644 index 07bd1c18dadb..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/abs.comp +++ /dev/null @@ -1,21 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - data_d[i] = D_TYPE(abs(float(data_a[i]))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp b/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp deleted file mode 100644 index 0028d3721d73..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/ceil.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(ceil(x)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp b/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp deleted file mode 100644 index 653431895e70..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/clamp.comp +++ /dev/null @@ -1,17 +0,0 @@ -#version 450 - -#include "types.glsl" -#include "generic_unary_head.glsl" - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -void main() { - const uint idx = get_idx(); - - if (idx >= p.ne) { - return; - } - - const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]); - data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/col2im_1d.comp b/ggml/src/ggml-vulkan/vulkan-shaders/col2im_1d.comp new file mode 100644 index 000000000000..a23de380f071 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/col2im_1d.comp @@ -0,0 +1,61 @@ +#version 450 + +#include "types.glsl" + +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; // columns: [K_OC, T_in] +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; // output: [T_out, OC] + +layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +layout (push_constant) uniform parameter { + uint32_t T_out; + uint32_t OC; + uint32_t K_OC; + uint32_t T_in; + uint32_t K; + int32_t stride; + int32_t p0; +} p; + +// Load A_TYPE to float +float load_col(uint32_t idx) { +#if defined(DATA_A_BF16) + return bf16_to_fp32(uint32_t(data_a[idx])); +#else + return float(data_a[idx]); +#endif +} + +// Store float as D_TYPE +void store_dst(uint32_t idx, float v) { +#if defined(DATA_A_BF16) + data_d[idx] = D_TYPE(fp32_to_bf16(v)); +#else + data_d[idx] = D_TYPE(v); +#endif +} + +void main() { + const uint32_t t_out = gl_GlobalInvocationID.x; + const uint32_t oc = gl_GlobalInvocationID.y; + if (t_out >= p.T_out || oc >= p.OC) return; + + const int32_t t_abs = int32_t(t_out) + p.p0; // absolute position in uncropped signal + + // Gather: only the ceil(K/stride) columns that scatter into t_abs, no modulo + int32_t t_in_min = (t_abs - int32_t(p.K) + p.stride) / p.stride; + if (t_in_min < 0) t_in_min = 0; + int32_t t_in_max = t_abs / p.stride; + if (t_in_max >= int32_t(p.T_in)) t_in_max = int32_t(p.T_in) - 1; + + float val = 0.0; + for (int32_t t_in = t_in_min; t_in <= t_in_max; t_in++) { + int32_t k = t_abs - t_in * p.stride; + // col layout: [K_OC, T_in], column index = oc * K + k + uint32_t col_idx = (oc * p.K + uint32_t(k)) + uint32_t(t_in) * p.K_OC; + val += load_col(col_idx); + } + + // dst layout: [T_out, OC], element (t_out, oc) = t_out + oc * T_out + store_dst(t_out + oc * p.T_out, val); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp index 1428ef68d81c..99400098bf2b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv2d_mm.comp @@ -158,7 +158,7 @@ const uint32_t Csh_stride = BS_NPQ; #ifdef COOPMAT const uint32_t Csh_len = BS_K * Csh_stride; #else -const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 1; +const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 8; // 8 to workaround compiler bug #endif shared SHMEM_TYPE Csh[Csh_len]; // K x NPQ #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/conv3d_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/conv3d_mm.comp new file mode 100644 index 000000000000..f66f299f6dae --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/conv3d_mm.comp @@ -0,0 +1,431 @@ +#version 450 + +#extension GL_EXT_control_flow_attributes : enable +#ifdef COOPMAT2 +#extension GL_NV_cooperative_matrix2 : enable +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_KHR_memory_scope_semantics : enable +#endif + +#ifdef COOPMAT +#extension GL_KHR_cooperative_matrix : enable +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_KHR_memory_scope_semantics : enable +#endif + +#include "types.glsl" + +// shape notation: [dim(N), ..., dim(0)] -- stride(dim(j)) >= stride(dim(i)) if i > j +layout(binding = 0) readonly buffer A { + A_TYPE knl_data[]; +}; // src0 - kernel: [KW, KH, KD, IC*OC] + +layout(binding = 1) readonly buffer B { + B_TYPE src_data[]; +}; // src1 - input: [IW, IH, ID, IC*N] -- channel_first format + +layout(binding = 2) writeonly buffer D { + D_TYPE dst_data[]; +}; // dst - result: [OW, OH, OD, OC*N] + +layout(push_constant) uniform parameter { + // I/O channels, batch size + uint32_t OC; + uint32_t IC; + uint32_t N; + + // Tensor spatial sizes: input, output + uint32_t IW; + uint32_t IH; + uint32_t ID; + uint32_t OW; + uint32_t OH; + uint32_t OD; + + // Strides in elements + uint32_t nb01; + uint32_t nb02; + uint32_t nb03; + + uint32_t nb11; + uint32_t nb12; + uint32_t nb13; + + uint32_t nb1; + uint32_t nb2; + uint32_t nb3; + + // fastdiv helper values + uint32_t OWmp; uint32_t OWL; + uint32_t OWOHmp; uint32_t OWOHL; + uint32_t OWOHODmp; uint32_t OWOHODL; +} + +p; + +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; +// Blocktile sizes +layout(constant_id = 1) const uint BS_K = 128; +layout(constant_id = 2) const uint BS_CRS = 16; +layout(constant_id = 3) const uint BS_NPQ = 128; +// Thread-tile sizes +layout(constant_id = 4) const uint TS_K = 8; +layout(constant_id = 5) const uint SHMEM_PAD = 4; +// Stride, padding, dilation +layout(constant_id = 6) const uint s0 = 1; +layout(constant_id = 7) const uint s1 = 1; +layout(constant_id = 8) const uint s2 = 1; +layout(constant_id = 9) const uint p0 = 0; +layout(constant_id = 10) const uint p1 = 0; +layout(constant_id = 11) const uint p2 = 0; +layout(constant_id = 12) const uint d0 = 1; +layout(constant_id = 13) const uint d1 = 1; +layout(constant_id = 14) const uint d2 = 1; +// Kernel spatial sizes +layout(constant_id = 15) const uint KW = 1; +layout(constant_id = 16) const uint KH = 1; +layout(constant_id = 17) const uint KD = 1; +// when set, skip bounds checks and address clamps (K/CRS/NPQ are tile-aligned) +layout(constant_id = 18) const uint aligned = 0; +// stage cm2 result through shmem (Csh) for coalesced stores. cm1 always does this. +layout(constant_id = 19) const uint csh_store = 0; + +#ifdef COOPMAT +// cm1 subgroup tile: each subgroup computes a WM x WN region as a grid of +// TM x TN x TK fragments. Requires WM%TM == WN%TN == BS_K%WM == BS_NPQ%WN == +// BS_CRS%TK == 0, and WG_SIZE == (BS_K/WM) * (BS_NPQ/WN) * subgroup_size. +layout(constant_id = 20) const uint WM = 32; +layout(constant_id = 21) const uint WN = 32; +const uint TM = 16; +const uint TN = 16; +const uint TK = 16; +const uint cms_per_row = WM / TM; +const uint cms_per_col = WN / TN; +const uint warps_M = BS_K / WM; +const uint warps_N = BS_NPQ / WN; +#endif + +// without padding, ID_idx/IH_idx/IW_idx are in bounds by construction +const bool dhw_in_bounds = (p0 == 0) && (p1 == 0) && (p2 == 0); + +uint32_t tid = gl_LocalInvocationID.x; +const uint32_t WG_SIZE = gl_WorkGroupSize.x; + +uint splitWork(uint work_size, uint block_size) { + return (block_size + work_size - 1) / block_size; +} + +uint32_t K = p.OC; +uint32_t CRS = p.IC * KD * KH * KW; +uint32_t NPQ = p.N * p.OD * p.OH * p.OW; + +// Number of blocktiles per input +uint32_t NB_CRS = splitWork(CRS, BS_CRS); + +#if defined(COOPMAT2) || defined(COOPMAT) +#define SHMEM_TYPE float16_t +#else +#define SHMEM_TYPE float +#endif + +const uint32_t Ash_stride = BS_CRS + SHMEM_PAD; +const uint32_t Bsh_stride = BS_NPQ + SHMEM_PAD; + +const uint32_t Ash_len = BS_K * Ash_stride; +const uint32_t Bsh_len = BS_CRS * Bsh_stride; + +shared SHMEM_TYPE Ash[Ash_len]; // K x CRS +shared SHMEM_TYPE Bsh[Bsh_len]; // CRS x NPQ + +#if defined(COOPMAT2) || defined(COOPMAT) +// stage matC through shmem so global stores are row-major (NPQ-contiguous) +const uint32_t Csh_stride = BS_NPQ; +#ifdef COOPMAT +const uint32_t Csh_len = BS_K * Csh_stride; +#else +const uint32_t Csh_len = csh_store != 0 ? BS_K * Csh_stride : 8; // 8 to workaround compiler bug +#endif +shared SHMEM_TYPE Csh[Csh_len]; // K x NPQ +#endif + +// Threadtile sizes +const uint32_t TS_NPQ = BS_K * BS_NPQ / WG_SIZE / TS_K; + +// Number of threadtiles per blocktile +const uint32_t NT_NPQ = BS_NPQ / TS_NPQ; + +/* +Compute +KxCRS @ CRSxNPQ = K x NPQ +K=OC +C=IC +D,R,S=KD,KH,KW +Z,P,Q=OD,OH,OW +*/ + +uint32_t B_idx_K = gl_WorkGroupID.x; +uint32_t B_idx_NPQ = gl_WorkGroupID.y + gl_WorkGroupID.z * 512; + +uint32_t T_y = tid / NT_NPQ; +uint32_t T_x = tid % NT_NPQ; + +uint32_t Ar = tid / BS_CRS; +uint32_t Ac = tid % BS_CRS; +const uint32_t ArpWg = WG_SIZE / BS_CRS; + +uint32_t Br = tid / BS_NPQ; +uint32_t Bc = tid % BS_NPQ; +const uint32_t BrpWg = WG_SIZE / BS_NPQ; + +// see init_fastdiv_values in ggml-vulkan.cpp +uint fastdiv(uint n, uint mp, uint L) { + uint msbs, lsbs; + // msbs = mulhi(n, mp) + umulExtended(n, mp, msbs, lsbs); + return (msbs + n) >> L; +} + +void split_crs(uint32_t crs_idx, out uint32_t ic, out uint32_t kd, out uint32_t kh, out uint32_t kw) { + const uint32_t KHKW = KH * KW; + const uint32_t KDKHKW = KD * KHKW; + ic = crs_idx / KDKHKW; + uint32_t rem = crs_idx - ic * KDKHKW; + kd = rem / KHKW; + rem = rem - kd * KHKW; + kh = rem / KW; + kw = rem - kh * KW; +} + +void split_npq(uint32_t npq_idx, out uint32_t n, out uint32_t od, out uint32_t oh, out uint32_t ow) { + const uint32_t OWOH = p.OW * p.OH; + n = fastdiv(npq_idx, p.OWOHODmp, p.OWOHODL); + uint32_t rem = npq_idx - n * p.OD * OWOH; + od = fastdiv(rem, p.OWOHmp, p.OWOHL); + rem = rem - od * OWOH; + oh = fastdiv(rem, p.OWmp, p.OWL); + ow = rem - oh * p.OW; +} + +#ifdef COOPMAT2 +#define ACC_TYPE float16_t + +ACC_TYPE perElemOpStore(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem) +{ + uint32_t K_idx = B_idx_K * BS_K + r; + uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + c; + uint32_t N_idx; + uint32_t OD_idx; + uint32_t OH_idx; + uint32_t OW_idx; + split_npq(NPQ_idx, N_idx, OD_idx, OH_idx, OW_idx); + uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + OD_idx * p.nb2 + (N_idx * p.OC + K_idx) * p.nb3; + if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) { + dst_data[dst_idx] = D_TYPE(elem); + } + return elem; +} +#endif + +void main() { + if (B_idx_NPQ * BS_NPQ >= NPQ) { + return; + } + +#ifdef COOPMAT2 + coopmat matC; + matC = coopmat(0.0); +#elif defined(COOPMAT) + coopmat sums[cms_per_row * cms_per_col]; + [[unroll]] for (uint i = 0; i < cms_per_row * cms_per_col; i++) { + sums[i] = coopmat(0.0); + } + const uint warp_r = gl_SubgroupID / warps_N; + const uint warp_c = gl_SubgroupID % warps_N; +#else + float regC[TS_K][TS_NPQ]; + for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) { + for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) { + regC[T_ly][T_lx] = 0.0; + } + } +#endif + /* Advance block in CRS dim */ + [[dont_unroll]] for (uint32_t B_idx_CRS = 0; B_idx_CRS < NB_CRS; B_idx_CRS++) { + uint32_t CRS_idx_a = B_idx_CRS * BS_CRS + Ac; + uint32_t IC_idx_a; + uint32_t KD_idx_a; + uint32_t KH_idx_a; + uint32_t KW_idx_a; + split_crs(CRS_idx_a, IC_idx_a, KD_idx_a, KH_idx_a, KW_idx_a); + + /* Load kernel to A_block: (BS_K x BS_CRS)*/ + UNROLL for (uint32_t r_offset = 0; r_offset < BS_K; r_offset += ArpWg) { + uint32_t B_ly = r_offset + Ar; + uint32_t B_lx = Ac; + uint32_t K_idx = B_idx_K * BS_K + B_ly; /* Global K_idx (row index of A)*/ + uint32_t knl_idx = KW_idx_a + KH_idx_a * p.nb01 + KD_idx_a * p.nb02 + (K_idx * p.IC + IC_idx_a) * p.nb03; + if (aligned == 0) { + knl_idx = min(knl_idx, K * CRS - 1); + } + float val = knl_data[knl_idx]; + if (aligned == 0 && (K_idx >= K || CRS_idx_a >= CRS)) { + val = 0.0; + } + Ash[B_ly * Ash_stride + B_lx] = SHMEM_TYPE(val); + } + /* Load input to B_block: (BS_CRS x BS_NPQ) */ + UNROLL for (uint32_t r_offset = 0; r_offset < BS_CRS; r_offset += BrpWg) { + uint32_t B_ly = r_offset + Br; /* Row index of B block */ + uint32_t B_lx = Bc; + uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + B_lx; /* Global NPQ index (column index of B) */ + uint32_t N_idx; + uint32_t OD_idx; + uint32_t OH_idx; + uint32_t OW_idx; + split_npq(NPQ_idx, N_idx, OD_idx, OH_idx, OW_idx); + + uint32_t CRS_idx_b = B_idx_CRS * BS_CRS + B_ly; + uint32_t IC_idx_b; + uint32_t KD_idx_b; + uint32_t KH_idx_b; + uint32_t KW_idx_b; + split_crs(CRS_idx_b, IC_idx_b, KD_idx_b, KH_idx_b, KW_idx_b); + + uint32_t ID_idx = OD_idx * s2 + KD_idx_b * d2 - p2; + uint32_t IH_idx = OH_idx * s1 + KH_idx_b * d1 - p1; + uint32_t IW_idx = OW_idx * s0 + KW_idx_b * d0 - p0; + + uint32_t src_idx = IW_idx + IH_idx * p.nb11 + ID_idx * p.nb12 + (N_idx * p.IC + IC_idx_b) * p.nb13; + // skip clamp when address can't go OOB + if (aligned == 0 || !dhw_in_bounds) { + src_idx = min(src_idx, p.IC * p.N * p.IW * p.IH * p.ID - 1); + } + float val = src_data[src_idx]; + bool oob = false; + if (aligned == 0 && (CRS_idx_b >= CRS || NPQ_idx >= NPQ)) { + oob = true; + } + // also catches lower-bound underflow (idx wraps to 0x80000000+) + if (!dhw_in_bounds && (ID_idx >= p.ID || IH_idx >= p.IH || IW_idx >= p.IW)) { + oob = true; + } + if (oob) { + val = 0.0; + } + Bsh[B_ly * Bsh_stride + B_lx] = SHMEM_TYPE(val); + } + barrier(); +#ifdef COOPMAT2 + coopmat matA; + coopmat matB; + + coopMatLoad(matA, Ash, 0, Ash_stride, gl_CooperativeMatrixLayoutRowMajor); + coopMatLoad(matB, Bsh, 0, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor); + matC = coopMatMulAdd(matA, matB, matC); +#elif defined(COOPMAT) + // each subgroup multiplies its grid of fragments per TK-sized CRS chunk + [[unroll]] for (uint k_step = 0; k_step < BS_CRS / TK; k_step++) { + coopmat cache_a[cms_per_row]; + [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) { + const uint a_off = (warp_r * WM + cm_row * TM) * Ash_stride + k_step * TK; + coopMatLoad(cache_a[cm_row], Ash, a_off, Ash_stride, gl_CooperativeMatrixLayoutRowMajor); + } + [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) { + coopmat cache_b; + const uint b_off = k_step * TK * Bsh_stride + warp_c * WN + cm_col * TN; + coopMatLoad(cache_b, Bsh, b_off, Bsh_stride, gl_CooperativeMatrixLayoutRowMajor); + [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) { + sums[cm_col * cms_per_row + cm_row] = coopMatMulAdd(cache_a[cm_row], cache_b, sums[cm_col * cms_per_row + cm_row]); + } + } + } +#else + if (T_y * TS_K < K) { + UNROLL for (uint32_t CRS_lidx = 0; CRS_lidx < BS_CRS; CRS_lidx++) { + float regA[TS_K]; + float regB[TS_NPQ]; + for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) { + regA[T_ly] = Ash[(T_y * TS_K + T_ly) * Ash_stride + CRS_lidx]; + } + for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) { + regB[T_lx] = Bsh[CRS_lidx * Bsh_stride + T_x * TS_NPQ + T_lx]; + } + for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) { + for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) { + regC[T_ly][T_lx] = fma(regA[T_ly], regB[T_lx], regC[T_ly][T_lx]); + } + } + } + } +#endif + barrier(); + } + /* Save C* */ +#if defined(COOPMAT2) || defined(COOPMAT) + // stage matC into Csh, then write to dst with coalesced NPQ-contiguous stores +#ifdef COOPMAT + const bool use_staged_store = true; +#else + const bool use_staged_store = (csh_store != 0); +#endif + if (use_staged_store) { +#ifdef COOPMAT + // cm1: each subgroup stores its fragment grid into its Csh slot + [[unroll]] for (uint cm_row = 0; cm_row < cms_per_row; cm_row++) { + [[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) { + const uint csh_off = (warp_r * WM + cm_row * TM) * Csh_stride + warp_c * WN + cm_col * TN; + coopMatStore(sums[cm_col * cms_per_row + cm_row], Csh, csh_off, Csh_stride, gl_CooperativeMatrixLayoutRowMajor); + } + } +#else + coopMatStore(matC, Csh, 0, Csh_stride, gl_CooperativeMatrixLayoutRowMajor); +#endif + barrier(); + + // cooperative shmem->global: WG threads spread across BS_NPQ (the + // contiguous direction of dst), each iter covers store_rows_per_iter K-rows + const uint32_t store_rows_per_iter = WG_SIZE / BS_NPQ; + const uint32_t store_iters = BS_K / store_rows_per_iter; + const uint32_t k_thread_offset = tid / BS_NPQ; + const uint32_t npq_thread = tid % BS_NPQ; + [[unroll]] for (uint32_t i = 0; i < store_iters; i++) { + uint32_t k_local = i * store_rows_per_iter + k_thread_offset; + uint32_t K_idx = B_idx_K * BS_K + k_local; + uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + npq_thread; + uint32_t N_idx; + uint32_t OD_idx; + uint32_t OH_idx; + uint32_t OW_idx; + split_npq(NPQ_idx, N_idx, OD_idx, OH_idx, OW_idx); + uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + OD_idx * p.nb2 + (N_idx * p.OC + K_idx) * p.nb3; + if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) { + dst_data[dst_idx] = D_TYPE(Csh[k_local * Csh_stride + npq_thread]); + } + } + } +#ifdef COOPMAT2 + else { + coopMatPerElementNV(matC, matC, perElemOpStore); + } +#endif +#else + if (T_y * TS_K < K) { + for (uint32_t T_ly = 0; T_ly < TS_K; T_ly++) { + for (uint32_t T_lx = 0; T_lx < TS_NPQ; T_lx++) { + uint32_t K_idx = B_idx_K * BS_K + T_y * TS_K + T_ly; + uint32_t NPQ_idx = B_idx_NPQ * BS_NPQ + T_x * TS_NPQ + T_lx; + uint32_t N_idx; + uint32_t OD_idx; + uint32_t OH_idx; + uint32_t OW_idx; + split_npq(NPQ_idx, N_idx, OD_idx, OH_idx, OW_idx); + uint32_t dst_idx = OW_idx + OH_idx * p.nb1 + OD_idx * p.nb2 + (N_idx * p.OC + K_idx) * p.nb3; + if (aligned != 0 || (K_idx < K && NPQ_idx < NPQ)) { + dst_data[dst_idx] = D_TYPE(regC[T_ly][T_lx]); + } + } + } + } +#endif +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp b/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp deleted file mode 100644 index db6865db9812..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/cos.comp +++ /dev/null @@ -1,17 +0,0 @@ -#version 450 - -#include "types.glsl" -#include "generic_unary_head.glsl" - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -void main() { - const uint idx = get_idx(); - - if (idx >= p.ne) { - return; - } - - const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]); - data_d[get_doffset() + dst_idx(idx)] = D_TYPE(cos(val)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp b/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp index 79761324f555..249e6b16eef1 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/diag.comp @@ -12,11 +12,11 @@ void main() { return; } - const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; if (i10 == i11) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp deleted file mode 100644 index 84dcbd8c88fc..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/elu.comp +++ /dev/null @@ -1,27 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - float x = float(data_a[i]); - - if (x < 0.0f) { - x = exp(x) - 1; - } - - data_d[i] = D_TYPE(x); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp b/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp deleted file mode 100644 index c7cf5ec68f76..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/exp.comp +++ /dev/null @@ -1,20 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - data_d[i] = D_TYPE(exp(float(data_a[i]))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp index 91fb07c93e7e..3192130ccf57 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp @@ -463,6 +463,7 @@ void main() { } rowmaxf = max(rowmaxf, float(Sf[r][c])); } + rowmaxf += FATTN_KQ_MAX_OFFSET; float Moldf = Mf[r]; // M = max(rowmax, Mold) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index 23ae3833e522..16178e577024 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -352,6 +352,7 @@ void main() { } rowmaxf = max(rowmaxf, float(sfsh[r_vec + (c * cols_per_iter + col_tid) * sfshstride][r_comp])); } + rowmaxf += FATTN_KQ_MAX_OFFSET; float Moldf = Mf[r]; // Compute max across the row diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp b/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp deleted file mode 100644 index 20017eb1843e..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/floor.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(floor(x)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp deleted file mode 100644 index a95c2525c8d8..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu.comp +++ /dev/null @@ -1,25 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const float GELU_COEF_A = 0.044715f; - const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float xi = float(data_a[i]); - const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi); - data_d[i] = D_TYPE(0.5f*xi*(2.0f - 2.0f / (exp(2 * val) + 1))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp deleted file mode 100644 index 58375aba09fd..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_erf.comp +++ /dev/null @@ -1,39 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - // based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation - // ref: https://www.johndcook.com/blog/python_erf/ - const float p_erf = 0.3275911f; - const float a1_erf = 0.254829592f; - const float a2_erf = -0.284496736f; - const float a3_erf = 1.421413741f; - const float a4_erf = -1.453152027f; - const float a5_erf = 1.061405429f; - - const float SQRT_2_INV = 0.70710678118654752440084436210484f; - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float a = float(data_a[i]); - const float a_div_sqr2 = a * SQRT_2_INV; - const float sign_x = sign(a_div_sqr2); - const float x = abs(a_div_sqr2); - const float t = 1.0f / (1.0f + p_erf * x); - const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x); - const float erf_approx = sign_x * y; - - data_d[i] = D_TYPE(0.5f * a * (1.0f + erf_approx)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp b/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp deleted file mode 100644 index bfdfe2182df6..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/gelu_quick.comp +++ /dev/null @@ -1,23 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const float GELU_QUICK_COEF = -1.702f; - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x)))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl index cc181fda870f..9d4176f3f967 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/generic_unary_head.glsl @@ -7,14 +7,12 @@ layout (push_constant) uniform parameter uint ne00; uint ne01; uint ne02; uint ne03; uint nb00; uint nb01; uint nb02; uint nb03; uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint misalign_offsets; - float param1; float param2; + float param1; float param2; float param3; float param4; - uint ne0_012mp; uint ne0_012L; - uint ne0_01mp; uint ne0_01L; - uint ne0_0mp; uint ne0_0L; - uint ne1_012mp; uint ne1_012L; - uint ne1_01mp; uint ne1_01L; - uint ne1_0mp; uint ne1_0L; + // The three L values are packed as bytes to keep this layout under the 128B + // push constant limit while still leaving room for four float parameters. + uint ne0_012mp; uint ne0_01mp; uint ne0_0mp; uint ne0_Ls; + uint ne1_012mp; uint ne1_01mp; uint ne1_0mp; uint ne1_Ls; } p; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; @@ -42,42 +40,46 @@ uint fastdiv(uint n, uint mp, uint L) { return (msbs + n) >> L; } +uint fastdiv_L(uint packed, uint slot) { + return (packed >> (slot * 8)) & 0x3Fu; +} + uint src0_idx(uint idx) { - const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L); + const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0)); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; - const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L); + const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1)); const uint i02_offset = i02*p.ne01*p.ne00; - const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L); + const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2)); const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00; return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00; } uint dst_idx(uint idx) { - const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10; } uint src0_idx_quant(uint idx, uint qk) { - const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L); + const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0)); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; - const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L); + const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1)); const uint i02_offset = i02*p.ne01*p.ne00; - const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L); + const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2)); const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00; return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + (i00/qk)*p.nb00; } uint dst_idx_quant(uint idx, uint qk) { - const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + (i10/qk)*p.nb10; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_back.comp b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_back.comp new file mode 100644 index 000000000000..7e3d8a281972 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/get_rows_back.comp @@ -0,0 +1,25 @@ +#version 450 + +#include "types.glsl" +#include "generic_binary_head.glsl" + +layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +void main() { + const uint col = gl_GlobalInvocationID.x; + + if (col >= p.ne20) { + return; + } + + for (uint row = gl_GlobalInvocationID.y; row < p.ne21; row += gl_WorkGroupSize.y * gl_NumWorkGroups.y) { + float sum = 0.0f; + for (uint i = 0; i < p.ne10; ++i) { + if (data_b[get_boffset() + i*p.nb10] == int(row)) { + sum += data_a[get_aoffset() + i*p.nb01 + col*p.nb00]; + } + } + + data_d[get_doffset() + row*p.nb21 + col*p.nb20] = sum; + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl index d8fdd8f7b5e7..c3cae736f977 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_head.glsl @@ -15,14 +15,33 @@ layout (push_constant) uniform parameter uint mode; float alpha; float limit; + uint nb00; uint nb01; uint nb02; uint nb03; - uint ne01; - uint ne02; + uint nb10; uint nb11; uint nb12; uint nb13; - uint ne11; - uint ne12; + uint nb20; + uint nb21; + uint nb22; + uint nb23; + uint ne21; + uint ne22; + uint misalign_offsets; + uint ne2_012mp; uint ne2_012L; + uint ne2_01mp; uint ne2_01L; + uint ne2_0mp; uint ne2_0L; } p; + +uint get_aoffset() { return p.misalign_offsets >> 16; } +uint get_boffset() { return (p.misalign_offsets >> 8) & 0xFF; } +uint get_doffset() { return p.misalign_offsets & 0xFF; } + +// see init_fastdiv_values in ggml-vulkan.cpp +uint fastdiv(uint n, uint mp, uint L) { + uint msbs, lsbs; + umulExtended(n, mp, msbs, lsbs); + return (msbs + n) >> L; +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl index 359461306a5d..14c5e7a54a97 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/glu_main.glsl @@ -5,35 +5,31 @@ void main() { return; } - const uint row = i / p.ne20; - const uint col = i - row * p.ne20; + const uint i23 = fastdiv(i, p.ne2_012mp, p.ne2_012L); + const uint i23_offset = i23 * p.ne22*p.ne21*p.ne20; + const uint i22 = fastdiv(i - i23_offset, p.ne2_01mp, p.ne2_01L); + const uint i22_offset = i22*p.ne21*p.ne20; + const uint i21 = fastdiv(i - i23_offset - i22_offset, p.ne2_0mp, p.ne2_0L); + const uint i20 = i - i23_offset - i22_offset - i21*p.ne20; - const uint i3 = row / (p.ne01 * p.ne02); - const uint i2 = (row % (p.ne01 * p.ne02)) / p.ne01; - const uint i1 = row % p.ne01; - const uint src_idx = i3 * p.nb03 + i2 * p.nb02 + i1 * p.nb01 + col; - - const uint dst_i3 = row / (p.ne11 * p.ne12); - const uint dst_i2 = (row % (p.ne11 * p.ne12)) / p.ne11; - const uint dst_i1 = row % p.ne11; - const uint dst_idx = dst_i3 * p.nb13 + dst_i2 * p.nb12 + dst_i1 * p.nb11 + col; + const uint src_idx_a = get_aoffset() + i23 * p.nb03 + i22 * p.nb02 + i21 * p.nb01 + i20 * p.nb00; + const uint src_idx_b = get_boffset() + i23 * p.nb13 + i22 * p.nb12 + i21 * p.nb11 + i20 * p.nb10; + const uint dst_idx = get_doffset() + i23 * p.nb23 + i22 * p.nb22 + i21 * p.nb21 + i20 * p.nb20; if (p.mode == 0) { // Default - const uint offset = p.ne00 / 2; - const uint idx = src_idx; + const uint offset = (p.ne00 / 2) * p.nb00; + const uint idx = src_idx_a; data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_a[idx + offset]))); } else if (p.mode == 1) { // Swapped - const uint offset = p.ne00 / 2; - const uint idx = src_idx; + const uint offset = (p.ne00 / 2) * p.nb00; + const uint idx = src_idx_a; data_d[dst_idx] = D_TYPE(op(float(data_a[idx + offset]), float(data_a[idx]))); } else { // Split - const uint idx = src_idx; - - data_d[dst_idx] = D_TYPE(op(float(data_a[idx]), float(data_b[idx]))); + data_d[dst_idx] = D_TYPE(op(float(data_a[src_idx_a]), float(data_b[src_idx_b]))); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp b/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp deleted file mode 100644 index b4dbdf314190..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/hardsigmoid.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(min(1.0f, max(0.0f, (x + 3.0f) / 6.0f))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp b/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp deleted file mode 100644 index 1ec315915e8d..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/hardswish.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(x * min(1.0f, max(0.0f, (x + 3.0f) / 6.0f))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp index f9af46744df4..9039ed1ded3b 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/l2_norm.comp @@ -14,16 +14,13 @@ void main() { const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint tid = gl_LocalInvocationID.x; - const uint i3 = row / (p.ne11 * p.ne12); - const uint i3_offset = i3 * p.ne12 * p.ne11; - const uint i2 = (row - i3_offset) / p.ne11; - const uint i2_offset = i2 * p.ne11; - const uint i1 = row - i3_offset - i2_offset; + const uint a_base = get_aoffset() + src0_idx(row * p.ne00); + const uint d_base = get_doffset() + dst_idx(row * p.ne10); sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp [[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) { - const FLOAT_TYPE xi = FLOAT_TYPE(data_a[i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0]); + const FLOAT_TYPE xi = FLOAT_TYPE(data_a[a_base + i0*p.nb00]); sum[tid] += xi * xi; } @@ -39,6 +36,6 @@ void main() { const FLOAT_TYPE scale = 1.0f / max(sqrt(sum[0]), FLOAT_TYPE(p.param1)); [[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) { - data_d[i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0] = D_TYPE(scale * FLOAT_TYPE(data_a[i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0])); + data_d[d_base + i0*p.nb10] = D_TYPE(scale * FLOAT_TYPE(data_a[a_base + i0*p.nb00])); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp deleted file mode 100644 index b281e855cb25..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/leaky_relu.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float val = float(data_a[i]); - data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp index fd84c3c91d83..7bbee577fb74 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vecq.comp @@ -28,13 +28,10 @@ vec2 cache_b_ds; #include "mul_mat_vecq_funcs.glsl" -void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint tid, const uint i) { +void iter(inout FLOAT_TYPE temp[NUM_COLS][NUM_ROWS], const uint first_row, const uint num_rows, const uint col, const uint b_qs_idx) { [[unroll]] for (uint j = 0; j < NUM_COLS; ++j) { - const uint col = i*BLOCK_SIZE + tid*K_PER_ITER; - // Preload data_b block const uint b_block_idx = (j*p.batch_stride_b + col) / QUANT_K_Q8_1 + b_offset; - const uint b_qs_idx = tid % (32 / K_PER_ITER); const uint b_block_idx_outer = b_block_idx / 4; const uint b_block_idx_inner = b_block_idx % 4; cache_b_ds = vec2(data_b[b_block_idx_outer].ds[b_block_idx_inner]); @@ -91,35 +88,35 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { } } - uint num_iters = p.ncols / (K_PER_ITER * BLOCK_SIZE); - if (num_iters * K_PER_ITER * BLOCK_SIZE + K_PER_ITER*tid < p.ncols) { + const uint col_stride = K_PER_ITER * BLOCK_SIZE; + uint num_iters = p.ncols / col_stride; + if (num_iters * col_stride + K_PER_ITER * tid < p.ncols) { num_iters++; } - int unroll_count = 4; - uint unrolled_iters = num_iters & ~(unroll_count - 1); - uint i = 0; - while (i < unrolled_iters) { + const uint b_qs_idx = tid % (32 / K_PER_ITER); + uint col = tid * K_PER_ITER; + while (num_iters >= 4) { // Manually partially unroll the loop - [[unroll]] for (uint k = 0; k < unroll_count; ++k) { - iter(temp, first_row, num_rows, tid, i*K_PER_ITER); - i++; + [[unroll]] for (uint k = 0; k < 4; ++k) { + iter(temp, first_row, num_rows, col, b_qs_idx); + col += col_stride; } - } - unroll_count = 2; - unrolled_iters = num_iters & ~(unroll_count - 1); + num_iters -= 4; + } - while (i < unrolled_iters) { + if (num_iters >= 2) { // Manually partially unroll the loop - [[unroll]] for (uint k = 0; k < unroll_count; ++k) { - iter(temp, first_row, num_rows, tid, i*K_PER_ITER); - i++; - } + iter(temp, first_row, num_rows, col, b_qs_idx); + col += col_stride; + iter(temp, first_row, num_rows, col, b_qs_idx); + col += col_stride; + num_iters -= 2; } - while (i < num_iters) { - iter(temp, first_row, num_rows, tid, i*K_PER_ITER); - i++; + + if (num_iters > 0) { + iter(temp, first_row, num_rows, col, b_qs_idx); } reduce_result(temp, d_offset, first_row, num_rows, tid); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index f39410d74f07..57c0410e4555 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -38,17 +38,7 @@ #define LOAD_VEC_B 1 #endif -// Load 2 values at once without affecting index calculations through LOAD_VEC -#if (defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16)) && !defined(ALIGNED) -#define LOAD_VEC_BATCH_A 2 -#else -#define LOAD_VEC_BATCH_A 1 -#endif -#if !defined(ALIGNED) -#define LOAD_VEC_BATCH_B 2 -#else -#define LOAD_VEC_BATCH_B 1 -#endif +layout (constant_id = 11) const uint ALIGNED = 0; #if !defined(TO_FLOAT_TYPE) #define TO_FLOAT_TYPE FLOAT_TYPE @@ -57,6 +47,13 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; +#if defined(DATA_A_F32) +layout (binding = 0) readonly buffer A_SCALAR {float data_a_scalar[];}; +#elif defined(DATA_A_F16) +layout (binding = 0) readonly buffer A_SCALAR {float16_t data_a_scalar[];}; +#elif defined(DATA_A_BF16) +layout (binding = 0) readonly buffer A_SCALAR {uint16_t data_a_scalar[];}; +#endif #if defined(A_TYPE_PACKED16) layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];}; #endif @@ -65,6 +62,7 @@ layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32 #endif layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; +layout (binding = 1) readonly buffer B_SCALAR {B_TYPE_SCALAR data_b_scalar[];}; layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; #ifdef MUL_MAT_ID @@ -194,13 +192,23 @@ void main() { const uint warp_r = warp_i % (BM / WM); const uint warp_c = warp_i / (BM / WM); - const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A / LOAD_VEC_BATCH_A); - const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A / LOAD_VEC_BATCH_A); - const uint loadr_b = gl_LocalInvocationID.x % (BK / LOAD_VEC_B / LOAD_VEC_BATCH_B); - const uint loadc_b = gl_LocalInvocationID.x / (BK / LOAD_VEC_B / LOAD_VEC_BATCH_B); +#if defined(DATA_A_F32) || defined(DATA_A_F16) || defined(DATA_A_BF16) + const uint LOAD_VEC_A_EFF = (ALIGNED != 0) ? LOAD_VEC_A : 1; + const uint LOAD_VEC_BATCH_A = (ALIGNED != 0) ? 1 : 2; +#else + const uint LOAD_VEC_A_EFF = LOAD_VEC_A; + const uint LOAD_VEC_BATCH_A = 1; +#endif + const uint LOAD_VEC_B_EFF = (ALIGNED != 0) ? LOAD_VEC_B : 1; + const uint LOAD_VEC_BATCH_B = (ALIGNED != 0) ? 1 : 2; + + const uint loadr_a = gl_LocalInvocationID.x % (BK / LOAD_VEC_A_EFF / LOAD_VEC_BATCH_A); + const uint loadc_a = gl_LocalInvocationID.x / (BK / LOAD_VEC_A_EFF / LOAD_VEC_BATCH_A); + const uint loadr_b = gl_LocalInvocationID.x % (BK / LOAD_VEC_B_EFF / LOAD_VEC_BATCH_B); + const uint loadc_b = gl_LocalInvocationID.x / (BK / LOAD_VEC_B_EFF / LOAD_VEC_BATCH_B); - const uint loadstride_a = gl_WorkGroupSize.x * LOAD_VEC_A * LOAD_VEC_BATCH_A / BK; - const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B * LOAD_VEC_BATCH_B / BK; + const uint loadstride_a = gl_WorkGroupSize.x * LOAD_VEC_A_EFF * LOAD_VEC_BATCH_A / BK; + const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B_EFF * LOAD_VEC_BATCH_B / BK; #ifdef MUL_MAT_ID #ifdef MUL_MAT_ID_USE_SUBGROUPS @@ -239,15 +247,15 @@ void main() { uint pos_a = #ifdef MUL_MAT_ID - expert_idx * (p.batch_stride_a / LOAD_VEC_A) + + expert_idx * (p.batch_stride_a / LOAD_VEC_A_EFF) + #else - batch_idx_a * (p.batch_stride_a / LOAD_VEC_A) + + batch_idx_a * (p.batch_stride_a / LOAD_VEC_A_EFF) + #endif - (ir * BM * p.stride_a + start_k) / LOAD_VEC_A; + (ir * BM * p.stride_a + start_k) / LOAD_VEC_A_EFF; #ifdef MUL_MAT_ID uint pos_b = 0; #else - uint pos_b = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) / LOAD_VEC_B; + uint pos_b = (batch_idx * p.batch_stride_b + ic * BN * p.stride_b + start_k) / LOAD_VEC_B_EFF; #endif #ifdef COOPMAT @@ -287,8 +295,8 @@ void main() { barrier(); - pos_a += BK / LOAD_VEC_A; - pos_b += BK / LOAD_VEC_B; + pos_a += BK / LOAD_VEC_A_EFF; + pos_b += BK / LOAD_VEC_B_EFF; #ifdef COOPMAT [[unroll]] for (uint i = 0; i < BK; i += TK) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp index 2656fe1c3e95..a2e15f6f5ced 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp @@ -36,6 +36,7 @@ layout (constant_id = 3) const uint BK = 16; // Assumed to be 32 if working wit layout (constant_id = 4) const bool enable_smaller_matrices = false; const uint BNover2 = enable_smaller_matrices ? (BN / 2) : BN; const uint BNover4 = enable_smaller_matrices ? (BN / 4) : BN; +layout (constant_id = 5) const uint ALIGNED = 0; layout (push_constant) uniform parameter { @@ -111,7 +112,7 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB { }; uint _ne1; -layout (constant_id = 5) const uint subgroup_size = 32; +layout (constant_id = 6) const uint subgroup_size = 32; shared uvec4 ballots_sh[BLOCK_SIZE / subgroup_size]; B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const in uint coordInBlock[2]) @@ -297,12 +298,12 @@ void main() { // Hint to the compiler that values are aligned (want 16B alignment). // Quants are always block-aligned, no alignment needed. -#if ALIGNED + if (ALIGNED != 0) { #if QUANT_K == 1 - stride_a &= ~7; -#endif - stride_b &= ~7; + stride_a &= ~7; #endif + stride_b &= ~7; + } // Create layouts for both clamped and unclamped accesses tensorLayoutNV<2> tensorLayoutA = createTensorLayoutNV(2); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl index 73595168984c..56a8a0f187f9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_funcs.glsl @@ -1,50 +1,57 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uint idx_m, const uint block, const uint end_k) { #if defined(DATA_A_F32) || defined(DATA_A_F16) #if LOAD_VEC_A == 8 - const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; - const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; - FLOAT_TYPEV8 aa = FLOAT_TYPEV8(data_a[idx]); - buf_a[buf_idx ] = aa[0].xy; - buf_a[buf_idx + 1] = aa[0].zw; - buf_a[buf_idx + 2] = aa[1].xy; - buf_a[buf_idx + 3] = aa[1].zw; + if (ALIGNED != 0) { + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; + FLOAT_TYPEV8 aa = FLOAT_TYPEV8(data_a[idx]); + buf_a[buf_idx ] = aa[0].xy; + buf_a[buf_idx + 1] = aa[0].zw; + buf_a[buf_idx + 2] = aa[1].xy; + buf_a[buf_idx + 3] = aa[1].zw; + return; + } #elif LOAD_VEC_A == 4 - const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; - const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; - FLOAT_TYPEV4 aa = FLOAT_TYPEV4(data_a[idx]); - buf_a[buf_idx ] = aa.xy; - buf_a[buf_idx + 1] = aa.zw; -#else // LOAD_VEC_BATCH_A == 2 + if (ALIGNED != 0) { + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; + FLOAT_TYPEV4 aa = FLOAT_TYPEV4(data_a[idx]); + buf_a[buf_idx ] = aa.xy; + buf_a[buf_idx + 1] = aa.zw; + return; + } +#endif const uint idx = pos_a + col * p.stride_a + row * 2; const uint buf_idx = col * SHMEM_STRIDE + row; if (idx_m < p.M && block + row * 2 + 1 < end_k) { - buf_a[buf_idx] = FLOAT_TYPEV2(data_a[idx], - data_a[idx + 1]); + buf_a[buf_idx] = FLOAT_TYPEV2(data_a_scalar[idx], + data_a_scalar[idx + 1]); } else if (idx_m < p.M && block + row * 2 < end_k) { - buf_a[buf_idx] = FLOAT_TYPEV2(data_a[idx], 0.0f); + buf_a[buf_idx] = FLOAT_TYPEV2(data_a_scalar[idx], 0.0f); } else { buf_a[buf_idx] = FLOAT_TYPEV2(0.0f); } -#endif #elif defined(DATA_A_BF16) #if LOAD_VEC_A == 4 - const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; - const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; - FLOAT_TYPEV4 aa = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_a[idx])); - buf_a[buf_idx ] = aa.xy; - buf_a[buf_idx + 1] = aa.zw; -#else // LOAD_VEC_BATCH_A == 2 + if (ALIGNED != 0) { + const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 2; + FLOAT_TYPEV4 aa = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_a[idx])); + buf_a[buf_idx ] = aa.xy; + buf_a[buf_idx + 1] = aa.zw; + return; + } +#endif const uint idx = pos_a + col * p.stride_a + row * 2; const uint buf_idx = col * SHMEM_STRIDE + row; if (idx_m < p.M && block + row * 2 + 1 < end_k) { - buf_a[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_a[idx]), - TO_FLOAT_TYPE(data_a[idx + 1])); + buf_a[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_a_scalar[idx]), + TO_FLOAT_TYPE(data_a_scalar[idx + 1])); } else if (idx_m < p.M && block + row * 2 < end_k) { - buf_a[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_a[idx]), 0.0f); + buf_a[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_a_scalar[idx]), 0.0f); } else { buf_a[buf_idx] = FLOAT_TYPEV2(0.0f); } -#endif #elif defined(DATA_A_Q4_0) const uint idx = pos_a + col * p.stride_a / LOAD_VEC_A + row; const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_A / 4; @@ -526,75 +533,85 @@ void load_a_to_shmem(const uint pos_a, const uint row, const uint col, const uin #if !defined(MUL_MAT_ID) void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uint idx_n, const uint block, const uint end_k) { #if LOAD_VEC_B == 8 - // Not supported for b_type bf16 because bf16mat2x4 does not exist - const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row; - const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2; - FLOAT_TYPEV8 bb = FLOAT_TYPEV8(data_b[idx]); - buf_b[buf_idx + 0] = bb[0].xy; - buf_b[buf_idx + 1] = bb[0].zw; - buf_b[buf_idx + 2] = bb[1].xy; - buf_b[buf_idx + 3] = bb[1].zw; + if (ALIGNED != 0) { + // Not supported for b_type bf16 because bf16mat2x4 does not exist + const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2; + FLOAT_TYPEV8 bb = FLOAT_TYPEV8(data_b[idx]); + buf_b[buf_idx + 0] = bb[0].xy; + buf_b[buf_idx + 1] = bb[0].zw; + buf_b[buf_idx + 2] = bb[1].xy; + buf_b[buf_idx + 3] = bb[1].zw; + return; + } #elif LOAD_VEC_B == 4 - const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row; - const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2; + if (ALIGNED != 0) { + const uint idx = pos_b + col * p.stride_b / LOAD_VEC_B + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2; #if defined(DATA_B_BF16) - FLOAT_TYPEV4 bb = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_b[idx])); + FLOAT_TYPEV4 bb = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_b[idx])); #else - FLOAT_TYPEV4 bb = FLOAT_TYPEV4(data_b[idx]); + FLOAT_TYPEV4 bb = FLOAT_TYPEV4(data_b[idx]); +#endif + buf_b[buf_idx + 0] = bb.xy; + buf_b[buf_idx + 1] = bb.zw; + return; + } #endif - buf_b[buf_idx + 0] = bb.xy; - buf_b[buf_idx + 1] = bb.zw; -#else // LOAD_VEC_BATCH_B == 2 const uint idx = pos_b + col * p.stride_b + row * 2; const uint buf_idx = col * SHMEM_STRIDE + row; if (idx_n < p.N && block + row * 2 + 1 < end_k) { - buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]), - TO_FLOAT_TYPE(data_b[idx + 1])); + buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b_scalar[idx]), + TO_FLOAT_TYPE(data_b_scalar[idx + 1])); } else if (idx_n < p.N && block + row * 2 < end_k) { - buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]), 0.0f); + buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b_scalar[idx]), 0.0f); } else { buf_b[buf_idx] = FLOAT_TYPEV2(0.0f); } -#endif } #else void load_b_to_shmem(const uint pos_b, const uint row, const uint col, const uint ic, const uint _ne1, const uint block, const uint end_k) { #if LOAD_VEC_B == 8 - // Not supported for b_type bf16 because bf16mat2x4 does not exist - const u16vec2 row_idx = row_ids[col]; - const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row; - const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2; - FLOAT_TYPEV8 bb = FLOAT_TYPEV8(data_b[idx]); - buf_b[buf_idx + 0] = bb[0].xy; - buf_b[buf_idx + 1] = bb[0].zw; - buf_b[buf_idx + 2] = bb[1].xy; - buf_b[buf_idx + 3] = bb[1].zw; + if (ALIGNED != 0) { + // Not supported for b_type bf16 because bf16mat2x4 does not exist + const u16vec2 row_idx = row_ids[col]; + const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2; + FLOAT_TYPEV8 bb = FLOAT_TYPEV8(data_b[idx]); + buf_b[buf_idx + 0] = bb[0].xy; + buf_b[buf_idx + 1] = bb[0].zw; + buf_b[buf_idx + 2] = bb[1].xy; + buf_b[buf_idx + 3] = bb[1].zw; + return; + } #elif LOAD_VEC_B == 4 - const u16vec2 row_idx = row_ids[col]; - const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row; - const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2; + if (ALIGNED != 0) { + const u16vec2 row_idx = row_ids[col]; + const uint idx = pos_b + row_idx.y * p.batch_stride_b / LOAD_VEC_B + (row_idx.x % p.ne11) * p.stride_b / LOAD_VEC_B + row; + const uint buf_idx = col * SHMEM_STRIDE + row * LOAD_VEC_B / 2; #if defined(DATA_B_BF16) - FLOAT_TYPEV4 bb = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_b[idx])); + FLOAT_TYPEV4 bb = FLOAT_TYPEV4(TO_FLOAT_TYPE(data_b[idx])); #else - FLOAT_TYPEV4 bb = FLOAT_TYPEV4(data_b[idx]); + FLOAT_TYPEV4 bb = FLOAT_TYPEV4(data_b[idx]); +#endif + buf_b[buf_idx + 0] = bb.xy; + buf_b[buf_idx + 1] = bb.zw; + return; + } #endif - buf_b[buf_idx + 0] = bb.xy; - buf_b[buf_idx + 1] = bb.zw; -#else // LOAD_VEC_BATCH_B == 2 const uint row_i = ic * BN + col; const uint buf_idx = col * SHMEM_STRIDE + row; if (row_i < _ne1 && block + row * 2 + 1 < end_k) { const u16vec2 row_idx = row_ids[col]; const uint idx = pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2; - buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]), - TO_FLOAT_TYPE(data_b[idx + 1])); + buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b_scalar[idx]), + TO_FLOAT_TYPE(data_b_scalar[idx + 1])); } else if (row_i < _ne1 && block + row * 2 < end_k) { const u16vec2 row_idx = row_ids[col]; const uint idx = pos_b + row_idx.y * p.batch_stride_b + (row_idx.x % p.ne11) * p.stride_b + row * 2; - buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b[idx]), 0.0f); + buf_b[buf_idx] = FLOAT_TYPEV2(TO_FLOAT_TYPE(data_b_scalar[idx]), 0.0f); } else { buf_b[buf_idx] = FLOAT_TYPEV2(0.0f); } -#endif } #endif diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp b/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp deleted file mode 100644 index 7f9b1bce99a4..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/neg.comp +++ /dev/null @@ -1,20 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - data_d[i] = D_TYPE(-float(data_a[i])); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp index cc3ea0b76060..792012d57e85 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/norm.comp @@ -1,26 +1,26 @@ #version 450 -#include "generic_head.glsl" #include "types.glsl" +#include "generic_unary_head.glsl" #extension GL_EXT_control_flow_attributes : enable #define BLOCK_SIZE 512 layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - shared vec2 sum[BLOCK_SIZE]; void main() { const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; const uint tid = gl_LocalInvocationID.x; + const uint a_base = get_aoffset() + src0_idx(row * p.ne00); + const uint d_base = get_doffset() + dst_idx(row * p.ne10); + sum[tid] = vec2(0.0f, 0.0f); - [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { - const float xi = float(data_a[row*p.KX + col]); + [[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) { + const float xi = float(data_a[a_base + i0*p.nb00]); sum[tid].x += xi; sum[tid].y += xi * xi; } @@ -34,11 +34,11 @@ void main() { barrier(); } - const float mean = sum[0].x / p.KX; - const float var = sum[0].y / p.KX - mean * mean; + const float mean = sum[0].x / p.ne00; + const float var = sum[0].y / p.ne00 - mean * mean; const float inv_std = inversesqrt(var + p.param1); - [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { - data_d[row*p.KX + col] = D_TYPE((float(data_a[row*p.KX + col]) - mean) * inv_std); + [[unroll]] for (uint i0 = tid; i0 < p.ne00; i0 += BLOCK_SIZE) { + data_d[d_base + i0*p.nb10] = D_TYPE((float(data_a[a_base + i0*p.nb00]) - mean) * inv_std); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp deleted file mode 100644 index 5725cef2366a..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/relu.comp +++ /dev/null @@ -1,21 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - data_d[i] = D_TYPE(max(float(data_a[i]), 0)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp b/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp index 87df782944a9..10f334d4229f 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/repeat_back.comp @@ -13,11 +13,11 @@ void main() { } // Destination multi-index (inlined dst_idx) - const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i13 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; - const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L); + const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i12_offset = i12*p.ne11*p.ne10; - const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L); + const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; const uint d_idx = i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp b/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp index 68fbd0c7be4e..dae811ad98ba 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/roll.comp @@ -20,11 +20,11 @@ void main() { return; } - const uint i3 = fastdiv(idx, p.ne1_012mp, p.ne1_012L); + const uint i3 = fastdiv(idx, p.ne1_012mp, fastdiv_L(p.ne1_Ls, 0)); const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10; - const uint i2 = fastdiv(idx - i3_offset, p.ne1_01mp, p.ne1_01L); + const uint i2 = fastdiv(idx - i3_offset, p.ne1_01mp, fastdiv_L(p.ne1_Ls, 1)); const uint i2_offset = i2*p.ne11*p.ne10; - const uint i1 = fastdiv(idx - i3_offset - i2_offset, p.ne1_0mp, p.ne1_0L); + const uint i1 = fastdiv(idx - i3_offset - i2_offset, p.ne1_0mp, fastdiv_L(p.ne1_Ls, 2)); const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10; const uint p1 = floatBitsToUint(p.param1); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/round.comp b/ggml/src/ggml-vulkan/vulkan-shaders/round.comp deleted file mode 100644 index e6155dcbf334..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/round.comp +++ /dev/null @@ -1,29 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - float result; - // Round halfway cases away from zero as roundf does. - if (x >= 0.0) { - result = floor(x + 0.5); - } else { - result = ceil(x - 0.5); - } - data_d[i] = D_TYPE(result); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp deleted file mode 100644 index a9c147bf9ac4..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/sgn.comp +++ /dev/null @@ -1,21 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - data_d[i] = D_TYPE(sign(float(data_a[i]))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp deleted file mode 100644 index 32298d43c602..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/sigmoid.comp +++ /dev/null @@ -1,20 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - data_d[i] = D_TYPE(1. / (1 + exp(-1. * float(data_a[i])))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp deleted file mode 100644 index 7d1cc6f45abb..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/silu.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float xi = float(data_a[i]); - data_d[i] = D_TYPE(xi / (1.0f + exp(-xi))); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp deleted file mode 100644 index 61f17b2f0068..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/sin.comp +++ /dev/null @@ -1,17 +0,0 @@ -#version 450 - -#include "types.glsl" -#include "generic_unary_head.glsl" - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -void main() { - const uint idx = get_idx(); - - if (idx >= p.ne) { - return; - } - - const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]); - data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sin(val)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp b/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp deleted file mode 100644 index 323e3cdea41f..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/softplus.comp +++ /dev/null @@ -1,23 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - const float result = (x > 20.0f) ? x : log(1.0f + exp(x)); - data_d[i] = D_TYPE(result); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp deleted file mode 100644 index 70daad6c5db2..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/sqrt.comp +++ /dev/null @@ -1,17 +0,0 @@ -#version 450 - -#include "types.glsl" -#include "generic_unary_head.glsl" - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -void main() { - const uint idx = get_idx(); - - if (idx >= p.ne) { - return; - } - - const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]); - data_d[get_doffset() + dst_idx(idx)] = D_TYPE(sqrt(val)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/square.comp b/ggml/src/ggml-vulkan/vulkan-shaders/square.comp deleted file mode 100644 index 4eb56afcb1eb..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/square.comp +++ /dev/null @@ -1,17 +0,0 @@ -#version 450 - -#include "types.glsl" -#include "generic_unary_head.glsl" - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -void main() { - const uint idx = get_idx(); - - if (idx >= p.ne) { - return; - } - - const FLOAT_TYPE val = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(idx)]); - data_d[get_doffset() + dst_idx(idx)] = D_TYPE(val * val); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/step.comp b/ggml/src/ggml-vulkan/vulkan-shaders/step.comp deleted file mode 100644 index 654a2124e04b..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/step.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(x >= 0.0f ? 1.0f : 0.0f); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp b/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp deleted file mode 100644 index 7b5eb413bf47..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/tanh.comp +++ /dev/null @@ -1,20 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - data_d[i] = D_TYPE(1. - 2. / (exp(2.*float(data_a[i])) + 1.)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp b/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp index f9b78f96072b..9def5dbc9a06 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/tri.comp @@ -17,11 +17,11 @@ void main() { return; } - const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L); + const uint i03 = fastdiv(idx, p.ne0_012mp, fastdiv_L(p.ne0_Ls, 0)); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; - const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L); + const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, fastdiv_L(p.ne0_Ls, 1)); const uint i02_offset = i02*p.ne01*p.ne00; - const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L); + const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, fastdiv_L(p.ne0_Ls, 2)); const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00; int param = floatBitsToInt(p.param1); diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp b/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp deleted file mode 100644 index cf1b76d3bb0d..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/trunc.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - const float x = float(data_a[i]); - data_d[i] = D_TYPE(trunc(x)); -} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/unary.comp b/ggml/src/ggml-vulkan/vulkan-shaders/unary.comp new file mode 100644 index 000000000000..5ee5275d2782 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/unary.comp @@ -0,0 +1,168 @@ +#version 450 + +#include "types.glsl" +#include "generic_unary_head.glsl" + +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + +float op_abs(float x) { + return abs(x); +} + +float op_sgn(float x) { + return sign(x); +} + +float op_neg(float x) { + return -x; +} + +float op_sqr(float x) { + return x * x; +} + +float op_sqrt(float x) { + return sqrt(x); +} + +float op_sin(float x) { + return sin(x); +} + +float op_cos(float x) { + return cos(x); +} + +float op_clamp(float x) { + return clamp(x, p.param1, p.param2); +} + +float op_leaky_relu(float x) { + return max(x, 0.0f) + min(x, 0.0f) * p.param1; +} + +float op_step(float x) { + return x > 0.0f ? 1.0f : 0.0f; +} + +float op_tanh(float x) { + return 1.0f - 2.0f / (exp(2.0f*x) + 1.0f); +} + +float op_elu(float x) { + return x < 0.0f ? exp(x) - 1.0f : x; +} + +float op_relu(float x) { + return max(x, 0.0f); +} + +float op_sigmoid(float x) { + return 1.0f / (1.0f + exp(-x)); +} + +float op_gelu(float x) { + const float GELU_COEF_A = 0.044715f; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + const float val = SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x); + return 0.5f*x*(2.0f - 2.0f / (exp(2.0f * val) + 1.0f)); +} + +float op_gelu_quick(float x) { + const float GELU_QUICK_COEF = -1.702f; + return x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))); +} + +float op_silu(float x) { + return x / (1.0f + exp(-x)); +} + +float op_hardswish(float x) { + return x * min(1.0f, max(0.0f, (x + 3.0f) / 6.0f)); +} + +float op_hardsigmoid(float x) { + return min(1.0f, max(0.0f, (x + 3.0f) / 6.0f)); +} + +float op_exp(float x) { + return exp(x); +} + +float op_expm1(float x) { + // exp(x) - 1 loses many ulps to cancellation near zero. Use a degree-6 + // Taylor expansion for |x| <= 1/4: the omitted x^7/5040 term is < 1.3e-8, + // about 0.5 ulp at expm1(0.25), and a host-side f32 model stays within + // 2 ulps over the interval. The first native exp(x)-1 values outside the + // cutoff are about 1 ulp for +0.25 and 2 ulps for -0.25. + if (abs(x) <= 0.25f) { + return x * (1.0f + x * (0.5f + x * ((1.0f/6.0f) + x * ((1.0f/24.0f) + x * ((1.0f/120.0f) + x * (1.0f/720.0f)))))); + } + return exp(x) - 1.0f; +} + +float op_softplus(float x) { + return (x > 20.0f) ? x : log(1.0f + exp(x)); +} + +float op_gelu_erf(float a) { + // based on Abramowitz and Stegun formula 7.1.26 or similar Hastings' approximation + const float p_erf = 0.3275911f; + const float a1_erf = 0.254829592f; + const float a2_erf = -0.284496736f; + const float a3_erf = 1.421413741f; + const float a4_erf = -1.453152027f; + const float a5_erf = 1.061405429f; + + const float SQRT_2_INV = 0.70710678118654752440084436210484f; + const float a_div_sqr2 = a * SQRT_2_INV; + const float sign_x = sign(a_div_sqr2); + const float x = abs(a_div_sqr2); + const float t = 1.0f / (1.0f + p_erf * x); + const float y = 1.0f - (((((a5_erf * t + a4_erf) * t) + a3_erf) * t + a2_erf) * t + a1_erf) * t * exp(-x * x); + return 0.5f * a * (1.0f + sign_x * y); +} + +float op_xielu(float x) { + const float alpha_n = p.param1; + const float alpha_p = p.param2; + const float beta = p.param3; + const float eps = p.param4; + + if (x > 0.0f) { + return alpha_p * x * x + beta * x; + } + + const float min_x_eps = min(x, eps); + return (op_expm1(min_x_eps) - x) * alpha_n + beta * x; +} + +float op_floor(float x) { + return floor(x); +} + +float op_ceil(float x) { + return ceil(x); +} + +float op_round(float x) { + // Round halfway cases away from zero as roundf does. + return x >= 0.0f ? floor(x + 0.5f) : ceil(x - 0.5f); +} + +float op_trunc(float x) { + return trunc(x); +} + +void main() { + const uint idx = get_idx(); + + if (idx >= p.ne) { + return; + } + + const uint a_idx = get_aoffset() + src0_idx(idx); + const uint d_idx = get_doffset() + dst_idx(idx); + + data_d[d_idx] = D_TYPE(OP(float(data_a[a_idx]))); +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 7bcb14608148..1925582ffedb 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,9 @@ std::mutex lock; std::vector> shader_fnames; +// Set when any shader subprocess fails (non-zero exit / stderr / launch failure) so the +// build is stopped instead of silently producing a broken libggml-vulkan. (issue #24393) +static std::atomic compile_failed{false}; std::locale c_locale("C"); std::string GLSLC = "glslc"; @@ -78,7 +82,7 @@ enum MatMulIdType { namespace { -void execute_command(std::vector& command, std::string& stdout_str, std::string& stderr_str) { +int execute_command(std::vector& command, std::string& stdout_str, std::string& stderr_str) { #ifdef _WIN32 HANDLE stdout_read, stdout_write; HANDLE stderr_read, stderr_write; @@ -127,8 +131,11 @@ void execute_command(std::vector& command, std::string& stdout_str, CloseHandle(stdout_read); CloseHandle(stderr_read); WaitForSingleObject(pi.hProcess, INFINITE); + DWORD exit_code = 1; + GetExitCodeProcess(pi.hProcess, &exit_code); CloseHandle(pi.hProcess); CloseHandle(pi.hThread); + return (int)exit_code; #else int stdout_pipe[2]; int stderr_pipe[2]; @@ -175,7 +182,9 @@ void execute_command(std::vector& command, std::string& stdout_str, close(stdout_pipe[0]); close(stderr_pipe[0]); - waitpid(pid, nullptr, 0); + int status = 0; + waitpid(pid, &status, 0); + return WIFEXITED(status) ? WEXITSTATUS(status) : -1; } #endif } @@ -372,13 +381,14 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p // } // std::cout << std::endl; - execute_command(cmd, stdout_str, stderr_str); - if (!stderr_str.empty()) { - std::cerr << "cannot compile " << name << "\n\n"; + int exit_code = execute_command(cmd, stdout_str, stderr_str); + if (exit_code != 0 || !stderr_str.empty()) { + std::cerr << "cannot compile " << name << " (exit code " << exit_code << ")\n\n"; for (const auto& part : cmd) { std::cerr << part << " "; } std::cerr << "\n\n" << stderr_str << std::endl; + compile_failed = true; return; } @@ -398,6 +408,7 @@ void string_to_spv_func(std::string name, std::string in_path, std::string out_p shader_fnames.push_back(std::make_pair(name, out_path)); } catch (const std::exception& e) { std::cerr << "Error executing command for " << name << ": " << e.what() << std::endl; + compile_failed = true; } } @@ -407,7 +418,7 @@ std::map merge_maps(const std::map> compiles; +static std::deque> compiles; void string_to_spv(std::string name, const std::string& source, const std::map& defines, bool fp16 = true, bool coopmat = false, bool coopmat2 = false, bool f16acc = false, const std::string& suffix = "") { name = name + (f16acc ? "_f16acc" : "") + (coopmat ? "_cm1" : "") + (coopmat2 ? "_cm2" : (fp16 ? "" : "_fp32")) + suffix; std::string out_path = join_paths(output_dir, name + ".spv"); @@ -426,6 +437,11 @@ void string_to_spv(std::string name, const std::string& source, const std::map defines = { + {"A_TYPE", a_f16 ? "float16_t" : "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, + {"UNROLL", unroll ? "[[unroll]]" : ""}, + }; + std::string name = std::string("conv3d") + (a_f16 ? "_f16" : "") + "_f32"; + string_to_spv(name + (unroll ? "_unroll" : ""), "conv3d_mm.comp", defines); +#if defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + if (unroll) { + auto cm2_defines = defines; + cm2_defines["COOPMAT2"] = "1"; + string_to_spv(name, "conv3d_mm.comp", cm2_defines, true, false, true); + } +#endif +#if defined(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + if (unroll) { + auto cm1_defines = defines; + cm1_defines["COOPMAT"] = "1"; + string_to_spv(name, "conv3d_mm.comp", cm1_defines, true, true, false); + } +#endif + } + } + string_to_spv("conv2d_dw_whcn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}})); string_to_spv("conv2d_dw_cwhn_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"CWHN", "1"}})); string_to_spv("conv2d_dw_whcn_f16_f32", "conv2d_dw.comp", merge_maps(base_dict, {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"WHCN", "1"}})); @@ -1240,6 +1282,11 @@ int main(int argc, char** argv) { process_shaders(); + if (compile_failed) { + std::cerr << "vulkan-shaders-gen: one or more shaders failed to compile" << std::endl; + return EXIT_FAILURE; + } + write_output_files(); return EXIT_SUCCESS; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp b/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp deleted file mode 100644 index 35d463bfe442..000000000000 --- a/ggml/src/ggml-vulkan/vulkan-shaders/xielu.comp +++ /dev/null @@ -1,35 +0,0 @@ -#version 450 - -#include "generic_head.glsl" -#include "types.glsl" - -#extension GL_EXT_control_flow_attributes : enable - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - -void main() { - const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; - - if (i >= p.KX) { - return; - } - - float x = float(data_a[i]); - - float alpha_n = p.param1; - float alpha_p = p.param2; - float beta = p.param3; - float eps = p.param4; - - if (x > 0.0f) { - x = alpha_p * x * x + beta * x; - } else { - const float min_x_eps = min(x, eps); - x = (exp(min_x_eps) - 1 - x) * alpha_n + beta * x; - } - - data_d[i] = D_TYPE(x); -} diff --git a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp index 6f877f15ce9e..c00a2e9ee9b8 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp @@ -905,11 +905,12 @@ struct ggml_webgpu_mul_mat_vec_pipeline_key { ggml_type src0_type; ggml_type src1_type; int vectorized; + uint32_t num_cols; bool use_mmvq; bool operator==(const ggml_webgpu_mul_mat_vec_pipeline_key & other) const { return src0_type == other.src0_type && src1_type == other.src1_type && vectorized == other.vectorized && - use_mmvq == other.use_mmvq; + num_cols == other.num_cols && use_mmvq == other.use_mmvq; } }; @@ -919,6 +920,7 @@ struct ggml_webgpu_mul_mat_vec_pipeline_key_hash { ggml_webgpu_hash_combine(seed, key.src0_type); ggml_webgpu_hash_combine(seed, key.src1_type); ggml_webgpu_hash_combine(seed, key.vectorized); + ggml_webgpu_hash_combine(seed, key.num_cols); ggml_webgpu_hash_combine(seed, key.use_mmvq); return seed; } @@ -993,11 +995,12 @@ struct ggml_webgpu_mul_mat_id_pipeline_key { ggml_type src0_type; ggml_type src1_type; uint32_t n_experts; + uint32_t num_cols; int vectorized; bool operator==(const ggml_webgpu_mul_mat_id_pipeline_key & other) const { return src0_type == other.src0_type && src1_type == other.src1_type && n_experts == other.n_experts && - vectorized == other.vectorized; + num_cols == other.num_cols && vectorized == other.vectorized; } }; @@ -1007,6 +1010,7 @@ struct ggml_webgpu_mul_mat_id_pipeline_key_hash { ggml_webgpu_hash_combine(seed, key.src0_type); ggml_webgpu_hash_combine(seed, key.src1_type); ggml_webgpu_hash_combine(seed, key.n_experts); + ggml_webgpu_hash_combine(seed, key.num_cols); ggml_webgpu_hash_combine(seed, key.vectorized); return seed; } @@ -1107,7 +1111,7 @@ inline bool ggml_webgpu_can_use_mmvq(const ggml_tensor * src0, const ggml_tensor * src1, bool supports_dot_product, const std::string & vendor) { - if (src1->ne[1] == 1) { + if (src1->ne[1] <= 4) { bool supports_dp4a = vendor == "amd" || vendor == "intel" || vendor == "nvidia"; if (supports_dp4a && supports_dot_product) { switch (src1->type) { @@ -1889,6 +1893,7 @@ class ggml_webgpu_shader_lib { (context.src0->type == GGML_TYPE_F32 || context.src0->type == GGML_TYPE_F16)) ? 1 : 0; + key.num_cols = context.dst->ne[1]; key.use_mmvq = ggml_webgpu_can_use_mmvq(context.src0, context.src1, context.supports_dot_product, context.vendor); @@ -2004,6 +2009,7 @@ class ggml_webgpu_shader_lib { if (key.vectorized) { variant += "_vectorized"; } + defines.push_back(std::string("NUM_COLS=") + std::to_string(key.num_cols)); auto processed = preprocessor.preprocess(shader_src, defines); auto decisions = std::make_shared(); @@ -2421,6 +2427,7 @@ class ggml_webgpu_shader_lib { if (key.vectorized) { variant += "_vectorized"; } + defines.push_back(std::string("NUM_COLS=1")); defines.push_back(std::string("N_EXPERTS=") + std::to_string(key.n_experts)); diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 0b605fa86baf..f0ec18abd9af 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -1418,15 +1418,17 @@ static void ggml_webgpu_quantize_q8_dispatch(webgpu_context & const size_t dst_offset = ggml_webgpu_tensor_offset(dst); const size_t q8_src1_align_offset = ROUNDUP_POW2( dst_offset + ggml_nbytes(dst), ctx->global_ctx->capabilities.limits.minStorageBufferOffsetAlignment); - const size_t q8_src1_binding_size = - ROUNDUP_POW2(src1->ne[3] * src1->ne[2] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32)), - WEBGPU_STORAGE_BUF_BINDING_MULT); + const size_t q8_src1_binding_size = ROUNDUP_POW2( + src1->ne[3] * src1->ne[2] * src1->ne[1] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32)), + WEBGPU_STORAGE_BUF_BINDING_MULT); std::vector q8_params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), + (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)), (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)), (uint32_t) src1->ne[0], + (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3], }; @@ -1442,7 +1444,7 @@ static void ggml_webgpu_quantize_q8_dispatch(webgpu_context & uint32_t q8_wg_x = 1; uint32_t q8_wg_y = 1; const uint32_t wg_per_vec = (src0->ne[0] / 4 + (q8_wg_size - 1)) / q8_wg_size; - const uint32_t q8_total_wg = src1->ne[2] * src1->ne[3] * wg_per_vec; + const uint32_t q8_total_wg = src1->ne[1] * src1->ne[2] * src1->ne[3] * wg_per_vec; const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension; compute_2d_workgroups(q8_total_wg, max_wg_per_dim, q8_wg_x, q8_wg_y); @@ -1456,7 +1458,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src1, ggml_tensor * dst) { // Determine if this is a mat-vec operation - bool is_vec = (dst->ne[1] == 1); + bool use_mat_vec = (dst->ne[1] <= 4); // use MMVQ path for mat-vec bool use_mmvq = ggml_webgpu_can_use_mmvq(src0, src1, ctx->global_ctx->capabilities.supports_dot_product, @@ -1482,7 +1484,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx, webgpu_pipeline pipeline; std::vector dispatches; - if (is_vec) { + if (use_mat_vec) { if (use_mmvq) { ggml_webgpu_quantize_q8_dispatch(ctx, src0, src1, dst, dispatches); } @@ -1529,7 +1531,7 @@ static webgpu_encoded_op ggml_webgpu_mul_mat(webgpu_context & ctx, uint32_t wg_y = 1; const uint32_t max_wg_per_dim = ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension; - if (is_vec) { + if (use_mat_vec) { auto * decisions = static_cast(pipeline.context.get()); uint32_t batches = dst->ne[2] * dst->ne[3]; @@ -3691,8 +3693,8 @@ static size_t ggml_backend_webgpu_buffer_type_get_alloc_size(ggml_backend_buffer ggml_webgpu_can_use_mmvq(src0, src1, ctx->webgpu_global_ctx->capabilities.supports_dot_product, ctx->webgpu_global_ctx->vendor); if (use_mmvq) { - const size_t q8_src1_size = - src1->ne[3] * src1->ne[2] * (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32)); + const size_t q8_src1_size = src1->ne[3] * src1->ne[2] * src1->ne[1] * + (36 /* sizeof(q8_1) */ * (src1->ne[0] / /* block_size */ 32)); res = ROUNDUP_POW2(res + q8_src1_size + ctx->webgpu_global_ctx->capabilities.limits.minStorageBufferOffsetAlignment, WEBGPU_STORAGE_BUF_BINDING_MULT); @@ -3788,7 +3790,7 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_global_context & ctx) { ctx->memset_pipeline = ggml_webgpu_create_pipeline(ctx->device, wgsl_memset, "memset", constants); } -static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) { +static void ggml_backend_webgpu_request_adapter(wgpu::Instance & instance, wgpu::Adapter & adapter) { wgpu::RequestAdapterOptions options = {}; #ifndef __EMSCRIPTEN__ @@ -3800,17 +3802,20 @@ static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) { options.nextInChain = &adapterTogglesDesc; #endif - ctx->webgpu_global_ctx->instance.WaitAny( - ctx->webgpu_global_ctx->instance.RequestAdapter( - &options, wgpu::CallbackMode::AllowSpontaneous, - [&ctx](wgpu::RequestAdapterStatus status, wgpu::Adapter adapter, const char * message) { - if (status != wgpu::RequestAdapterStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message); - return; - } - ctx->webgpu_global_ctx->adapter = std::move(adapter); - }), - UINT64_MAX); + instance.WaitAny(instance.RequestAdapter( + &options, wgpu::CallbackMode::AllowSpontaneous, + [&adapter](wgpu::RequestAdapterStatus status, wgpu::Adapter _adapter, const char * message) { + if (status != wgpu::RequestAdapterStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message); + return; + } + adapter = std::move(_adapter); + }), + UINT64_MAX); +} + +static void create_webgpu_device(ggml_backend_webgpu_reg_context * ctx) { + ggml_backend_webgpu_request_adapter(ctx->webgpu_global_ctx->instance, ctx->webgpu_global_ctx->adapter); GGML_ASSERT(ctx->webgpu_global_ctx->adapter != nullptr); ctx->webgpu_global_ctx->adapter.GetLimits(&ctx->webgpu_global_ctx->capabilities.limits); @@ -4265,7 +4270,7 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_OP_RMS_NORM: case GGML_OP_NORM: case GGML_OP_L2_NORM: - supports_op = op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32; + supports_op = (op->type == GGML_TYPE_F32 && src0->type == GGML_TYPE_F32) && ggml_is_contiguous_rows(src0); break; case GGML_OP_ROPE: supports_op = op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16; @@ -4543,20 +4548,7 @@ ggml_backend_reg_t ggml_backend_webgpu_reg() { // Probe for adapter support wgpu::Adapter adapter; if (ctx->webgpu_global_ctx->instance != nullptr) { - wgpu::RequestAdapterOptions options = {}; - - // probe for adapter support - ctx->webgpu_global_ctx->instance.WaitAny( - ctx->webgpu_global_ctx->instance.RequestAdapter( - &options, wgpu::CallbackMode::AllowSpontaneous, - [&adapter](wgpu::RequestAdapterStatus status, wgpu::Adapter _adapter, const char * message) { - if (status != wgpu::RequestAdapterStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to get an adapter: %s\n", message); - return; - } - adapter = std::move(_adapter); - }), - UINT64_MAX); + ggml_backend_webgpu_request_adapter(ctx->webgpu_global_ctx->instance, adapter); } // WebGPU backend requires f16 support and, on native, implicit device synchronization. diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl index ed4a6b13bbfb..6a2eb8c824e1 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_decls.tmpl @@ -98,6 +98,7 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 } #endif // INIT_SRC0_SHMEM_Q1_0 +// legacy-quants #if defined(INIT_SRC0_SHMEM_Q4_0) || defined(INIT_SRC0_SHMEM_Q4_1) || defined(INIT_SRC0_SHMEM_Q5_0) || defined(INIT_SRC0_SHMEM_Q5_1) || defined(INIT_SRC0_SHMEM_Q8_0) || defined(INIT_SRC0_SHMEM_Q8_1) || defined(INIT_SRC0_SHMEM_MXFP4) const BLOCK_SIZE = 32u; // the number of blocks per k-tile. Note that this currently only works if TILE_K is a multiple of BLOCK_SIZE, which may need to be rethought for larger quantized types. @@ -124,7 +125,7 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 if (global_m < params.m && global_block_k < params.k / BLOCK_SIZE) { let src0_idx = batch_offset + global_m * params.stride_01 + global_block_k; -#ifdef INIT_SRC0_SHMEM_Q4_0 +#if defined(INIT_SRC0_SHMEM_Q4_0) let block_byte_base = src0_idx * 18u; // BLOCK_SIZE_BYTES = 18u; let d = load_f16_at_src0(block_byte_base); @@ -134,7 +135,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 let q_packed = load_u32_at_src0(q_byte_offset); dequant_q4_0_packed_to_shmem(q_packed, d, shmem_idx + j * BYTES_PER_INNER_LOOP); } -#elif INIT_SRC0_SHMEM_Q4_1 +#endif // INIT_SRC0_SHMEM_Q4_0 + +#if defined(INIT_SRC0_SHMEM_Q4_1) let block_byte_base = src0_idx * 20u; // BLOCK_SIZE_BYTES = 20u; let dm = unpack2x16float(load_u32_at_src0_aligned(block_byte_base)); let d = f16(dm[0]); @@ -153,7 +156,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = q_hi; } } -#elif INIT_SRC0_SHMEM_Q5_0 +#endif // INIT_SRC0_SHMEM_Q4_1 + +#if defined(INIT_SRC0_SHMEM_Q5_0) let block_byte_base = src0_idx * 22u; // BLOCK_SIZE_BYTES = 22u; let d = load_f16_at_src0(block_byte_base); @@ -176,7 +181,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = q_hi; } } -#elif INIT_SRC0_SHMEM_Q5_1 +#endif // INIT_SRC0_SHMEM_Q5_0 + +#if defined(INIT_SRC0_SHMEM_Q5_1) let block_byte_base = src0_idx * 24u; // BLOCK_SIZE_BYTES = 24u; let dm = unpack2x16float(load_u32_at_src0_aligned(block_byte_base)); @@ -201,7 +208,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = q_hi; } } -#elif INIT_SRC0_SHMEM_Q8_0 +#endif // INIT_SRC0_SHMEM_Q5_1 + +#if defined(INIT_SRC0_SHMEM_Q8_0) let block_byte_base = src0_idx * 34u; // BLOCK_SIZE_BYTES = 34u; let d = load_f16_at_src0(block_byte_base); @@ -211,7 +220,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 let q_packed = load_u32_at_src0(q_byte_offset); dequant_q8_0_packed_to_shmem(q_packed, d, shmem_idx + j * BYTES_PER_INNER_LOOP); } -#elif INIT_SRC0_SHMEM_Q8_1 +#endif // INIT_SRC0_SHMEM_Q8_0 + +#if defined(INIT_SRC0_SHMEM_Q8_1) let block_byte_base = src0_idx * 36u; // BLOCK_SIZE_BYTES = 36u; let dm = unpack2x16float(load_u32_at_src0_aligned(block_byte_base)); let d = f16(dm[0]); @@ -227,7 +238,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k] = q_val; } } -#elif INIT_SRC0_SHMEM_MXFP4 +#endif // INIT_SRC0_SHMEM_Q8_1 + +#if defined(INIT_SRC0_SHMEM_MXFP4) let block_byte_base = src0_idx * 17u; let eu8 = get_byte(load_u32_at_src0_aligned(block_byte_base), block_byte_base & 3u); let e = ldexp(1.0, i32(eu8) - 128); @@ -244,11 +257,11 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 shmem[shmem_idx + j * BYTES_PER_INNER_LOOP + k + 16u] = f16(q_hi); } } -#endif +#endif // INIT_SRC0_SHMEM_MXFP4 } } } -#endif +#endif // legacy-quants // k-quants #if defined(INIT_SRC0_SHMEM_Q2_K) || defined(INIT_SRC0_SHMEM_Q3_K) || defined(INIT_SRC0_SHMEM_Q4_K) || defined(INIT_SRC0_SHMEM_Q5_K) || defined(INIT_SRC0_SHMEM_Q6_K) @@ -284,7 +297,7 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 let src0_idx = batch_offset + global_m * params.stride_01 + block_k; -#ifdef INIT_SRC0_SHMEM_Q2_K +#if defined(INIT_SRC0_SHMEM_Q2_K) let block_byte_base = src0_idx * 84u; // BLOCK_SIZE_BYTES = 84u; let scales_byte_base = block_byte_base; let qs_byte_base = block_byte_base + 16u; @@ -314,7 +327,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 let ml = dmin * f16(scale >> 4u); store_shmem_kquants(qs_vec4 * dl - ml, elem_idx); -#elif INIT_SRC0_SHMEM_Q3_K +#endif // INIT_SRC0_SHMEM_Q2_K + +#if defined(INIT_SRC0_SHMEM_Q3_K) let block_byte_base = src0_idx * 110u; // BLOCK_SIZE_BYTES = 110u; let hmask_byte_base = block_byte_base + 0u; let qs_byte_base = block_byte_base + 32u; @@ -355,7 +370,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 let dl = d_all * (f16((scale_hi2 << 4u) | scale_low4) - 32.0); store_shmem_kquants(dl * q_vec4, elem_idx); -#elif INIT_SRC0_SHMEM_Q4_K +#endif // INIT_SRC0_SHMEM_Q3_K + +#if defined(INIT_SRC0_SHMEM_Q4_K) let block_byte_base = src0_idx * 144u; // BLOCK_SIZE_BYTES = 144u; let dm_byte_base = block_byte_base + 0u; let scale_byte_base = block_byte_base + 4u; @@ -399,7 +416,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 let ml = dmin * f16(mn); store_shmem_kquants(dl * qs_vec4 - vec4(ml, ml, ml, ml), elem_idx); -#elif INIT_SRC0_SHMEM_Q5_K +#endif // INIT_SRC0_SHMEM_Q4_K + +#if defined(INIT_SRC0_SHMEM_Q5_K) let block_byte_base = src0_idx * 176u; // BLOCK_SIZE_BYTES = 176u; let dm_byte_base = block_byte_base + 0u; let scale_byte_base = block_byte_base + 4u; @@ -456,7 +475,9 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 let ml = dmin * f16(mn); store_shmem_kquants((qh_vec4 + qs_lo4_vec4) * dl - vec4(ml, ml, ml, ml), elem_idx); -#elif INIT_SRC0_SHMEM_Q6_K +#endif // INIT_SRC0_SHMEM_Q5_K + +#if defined(INIT_SRC0_SHMEM_Q6_K) let block_byte_base = src0_idx * 210u; // BLOCK_SIZE_BYTES = 210u; let ql_byte_base = block_byte_base; let qh_byte_base = block_byte_base + 128u; @@ -497,17 +518,18 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 let scale = get_byte_i32(scale_word, scale_byte & 3u); store_shmem_kquants(d * q_vec4 * f16(scale), elem_idx); -#endif +#endif // INIT_SRC0_SHMEM_Q6_K } } #endif // k-quants -#ifdef INIT_SRC0_SHMEM_IQ4_NL +#if defined(INIT_SRC0_SHMEM_IQ4_NL) const BLOCK_SIZE = 32u; const BLOCK_SIZE_BYTES = 18u; +const NQ = 4u; fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) { - for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) { + for (var elem_idx = thread_id * NQ; elem_idx < TILE_SRC0_SHMEM; elem_idx += NQ * TOTAL_WORKGROUP_SIZE) { let tile_m = elem_idx / TILE_K; let tile_k = elem_idx % TILE_K; let global_m = offset_m + tile_m; @@ -519,408 +541,464 @@ fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u3 } let block_k = global_k / BLOCK_SIZE; - let k_in_block = global_k % BLOCK_SIZE; + let k_in_block = global_k % BLOCK_SIZE; // k_in_block % 4 == 0; + + let src0_idx = batch_offset + global_m * params.stride_01 + block_k; - let src0_idx = batch_offset + global_m * params.stride_01 + block_k; let block_byte_base = src0_idx * BLOCK_SIZE_BYTES; - let d = load_f16_at_src0(block_byte_base); + let d_byte_base = block_byte_base + 0u; + let qs_byte_base = block_byte_base + 2u; - let pos = k_in_block % 16u; - let nib_shift = (k_in_block / 16u) * 4u; - let q_packed = load_u32_at_src0(block_byte_base + 2u + (pos / 4u) * 4u); - let nib = (get_byte(q_packed, pos % 4u) >> nib_shift) & 0xFu; + let d = load_f16_at_src0(d_byte_base); - shmem[elem_idx] = d * f16(kvalues_iq4nl[nib]); + let id_qtr = (k_in_block % 16u) / 4u; + let shift_phase = k_in_block / 16u; + + let qs_u32 = load_u32_at_src0(qs_byte_base + 4u * id_qtr); + + shmem[elem_idx + 0u] = d * f16(kvalues_iq4nl[(qs_u32 >> ( 0u + 4u * shift_phase)) & 0xFu]); + shmem[elem_idx + 1u] = d * f16(kvalues_iq4nl[(qs_u32 >> ( 8u + 4u * shift_phase)) & 0xFu]); + shmem[elem_idx + 2u] = d * f16(kvalues_iq4nl[(qs_u32 >> (16u + 4u * shift_phase)) & 0xFu]); + shmem[elem_idx + 3u] = d * f16(kvalues_iq4nl[(qs_u32 >> (24u + 4u * shift_phase)) & 0xFu]); } } #endif // INIT_SRC0_SHMEM_IQ4_NL -#ifdef INIT_SRC0_SHMEM_IQ4_XS +// i-quants (super block size: 256) +#if defined(INIT_SRC0_SHMEM_IQ4_XS) || defined(INIT_SRC0_SHMEM_IQ1_S) || defined(INIT_SRC0_SHMEM_IQ1_M) || defined(INIT_SRC0_SHMEM_IQ2_XXS) \ +|| defined(INIT_SRC0_SHMEM_IQ2_XS) || defined(INIT_SRC0_SHMEM_IQ2_S) || defined(INIT_SRC0_SHMEM_IQ3_XXS) || defined(INIT_SRC0_SHMEM_IQ3_S) const BLOCK_SIZE = 256u; -const BLOCK_SIZE_BYTES = 136u; +const NQ = 16u; -fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) { - for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) { - let tile_m = elem_idx / TILE_K; - let tile_k = elem_idx % TILE_K; - let global_m = offset_m + tile_m; - let global_k = k_outer + tile_k; +fn store_shmem_iquants(val: vec4, idx: u32) { + shmem[idx] = val.x; + shmem[idx + 1] = val.y; + shmem[idx + 2] = val.z; + shmem[idx + 3] = val.w; +} - if (global_m >= params.m || global_k >= params.k) { - shmem[elem_idx] = f16(0.0); - continue; - } +fn load_byte_at_src0_aligned(byte_offset: u32) -> u32 { + return get_byte(load_u32_at_src0_aligned(byte_offset), byte_offset % 4u); +} - let block_k = global_k / BLOCK_SIZE; - let k_in_block = global_k % BLOCK_SIZE; +#if defined(INIT_SRC0_SHMEM_IQ1_M) || defined(INIT_SRC0_SHMEM_IQ1_S) +fn create_iq_gw4(dl: f32, gw: u32, shift_base: u32, delta: f32) -> vec4 { + return vec4( + f16(dl * (f32((bitcast(((gw >> (shift_base + 0u)) & 3u) << 30u) >> 30u)) + delta)), + f16(dl * (f32((bitcast(((gw >> (shift_base + 2u)) & 3u) << 30u) >> 30u)) + delta)), + f16(dl * (f32((bitcast(((gw >> (shift_base + 4u)) & 3u) << 30u) >> 30u)) + delta)), + f16(dl * (f32((bitcast(((gw >> (shift_base + 6u)) & 3u) << 30u) >> 30u)) + delta)), + ); +} +#endif - let src0_idx = batch_offset + global_m * params.stride_01 + block_k; - let block_byte_base = src0_idx * BLOCK_SIZE_BYTES; +#if defined(INIT_SRC0_SHMEM_IQ4_XS) +fn create_iq_gw4(dl: f16, qs_u32: u32, shift_phase: u32) -> vec4 { + return vec4( + dl * f16(kvalues_iq4nl[(qs_u32 >> (4 * shift_phase + 0u)) & 0xFu]), + dl * f16(kvalues_iq4nl[(qs_u32 >> (4 * shift_phase + 8u)) & 0xFu]), + dl * f16(kvalues_iq4nl[(qs_u32 >> (4 * shift_phase + 16u)) & 0xFu]), + dl * f16(kvalues_iq4nl[(qs_u32 >> (4 * shift_phase + 24u)) & 0xFu]), + ); +} +#endif - let d_scales_h = load_u32_at_src0(block_byte_base); - let d = bitcast>(d_scales_h).x; - let scales_h = d_scales_h >> 16u; +#if defined(INIT_SRC0_SHMEM_IQ2_XXS) +fn create_iq_gw4(ig: u32, grid_phase: u32) -> vec4 { + return vec4( + f32(get_byte(iq2xxs_grid[(ig + grid_phase + 0u) / 4u], (ig + grid_phase + 0u) % 4u)), + f32(get_byte(iq2xxs_grid[(ig + grid_phase + 1u) / 4u], (ig + grid_phase + 1u) % 4u)), + f32(get_byte(iq2xxs_grid[(ig + grid_phase + 2u) / 4u], (ig + grid_phase + 2u) % 4u)), + f32(get_byte(iq2xxs_grid[(ig + grid_phase + 3u) / 4u], (ig + grid_phase + 3u) % 4u)), + ); +} +#endif - let ib = k_in_block / 32u; - let pos = k_in_block % 32u; +#if defined(INIT_SRC0_SHMEM_IQ2_XS) +fn create_iq_gw4(ig: u32, grid_phase: u32) -> vec4 { + return vec4( + f32(get_byte(iq2xs_grid[(ig + grid_phase + 0u) / 4u], (ig + grid_phase + 0u) % 4u)), + f32(get_byte(iq2xs_grid[(ig + grid_phase + 1u) / 4u], (ig + grid_phase + 1u) % 4u)), + f32(get_byte(iq2xs_grid[(ig + grid_phase + 2u) / 4u], (ig + grid_phase + 2u) % 4u)), + f32(get_byte(iq2xs_grid[(ig + grid_phase + 3u) / 4u], (ig + grid_phase + 3u) % 4u)), + ); +} +#endif - let scales_l_word = load_u32_at_src0(block_byte_base + 4u); - let ls_lo = (get_byte(scales_l_word, ib / 2u) >> ((ib & 1u) * 4u)) & 0xFu; - let ls_hi = ((scales_h >> (2u * ib)) & 3u) << 4u; - let dl = d * f16(i32(ls_lo | ls_hi) - 32); +#if defined(INIT_SRC0_SHMEM_IQ2_S) +fn create_iq_gw4(ig: u32, grid_phase: u32) -> vec4 { + return vec4( + f32(get_byte(iq2s_grid[(ig + grid_phase + 0u) / 4u], (ig + grid_phase + 0u) % 4u)), + f32(get_byte(iq2s_grid[(ig + grid_phase + 1u) / 4u], (ig + grid_phase + 1u) % 4u)), + f32(get_byte(iq2s_grid[(ig + grid_phase + 2u) / 4u], (ig + grid_phase + 2u) % 4u)), + f32(get_byte(iq2s_grid[(ig + grid_phase + 3u) / 4u], (ig + grid_phase + 3u) % 4u)), + ); +} +#endif - let iqs = ib * 16u + (pos % 16u); - let nib_shift = (pos / 16u) * 4u; - let q_packed = load_u32_at_src0(block_byte_base + 8u + (iqs / 4u) * 4u); - let nib = (get_byte(q_packed, iqs % 4u) >> nib_shift) & 0xFu; +#if defined(INIT_SRC0_SHMEM_IQ3_XXS) +fn create_iq_gw4(ig: u32) -> vec4 { + return vec4( + f32(get_byte(iq3xxs_grid[ig], 0)), + f32(get_byte(iq3xxs_grid[ig], 1)), + f32(get_byte(iq3xxs_grid[ig], 2)), + f32(get_byte(iq3xxs_grid[ig], 3)), + ); +} +#endif - shmem[elem_idx] = dl * f16(kvalues_iq4nl[nib]); - } +#if defined(INIT_SRC0_SHMEM_IQ3_S) +fn create_iq_gw4(ig: u32) -> vec4 { + return vec4( + f32(get_byte(iq3s_grid[ig], 0)), + f32(get_byte(iq3s_grid[ig], 1)), + f32(get_byte(iq3s_grid[ig], 2)), + f32(get_byte(iq3s_grid[ig], 3)), + ); } -#endif // INIT_SRC0_SHMEM_IQ4_XS +#endif -#ifdef INIT_SRC0_SHMEM_IQ1_S -const BLOCK_SIZE = 256u; -const BLOCK_SIZE_BYTES = 50u; +#if defined(INIT_SRC0_SHMEM_IQ2_XXS) || defined(INIT_SRC0_SHMEM_IQ2_XS) || defined(INIT_SRC0_SHMEM_IQ2_S) \ +|| defined(INIT_SRC0_SHMEM_IQ3_XXS) || defined(INIT_SRC0_SHMEM_IQ3_S) +fn create_iq2_m4(signs: u32, mask_phase: u32) -> vec4 { + return vec4( + select(1.0, -1.0, (get_byte(kmask_iq2xs[mask_phase], 0) & signs) != 0u), + select(1.0, -1.0, (get_byte(kmask_iq2xs[mask_phase], 1) & signs) != 0u), + select(1.0, -1.0, (get_byte(kmask_iq2xs[mask_phase], 2) & signs) != 0u), + select(1.0, -1.0, (get_byte(kmask_iq2xs[mask_phase], 3) & signs) != 0u), + ); +} +#endif fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) { - for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) { + for (var elem_idx = thread_id * NQ; elem_idx < TILE_SRC0_SHMEM; elem_idx += NQ * TOTAL_WORKGROUP_SIZE) { let tile_m = elem_idx / TILE_K; let tile_k = elem_idx % TILE_K; let global_m = offset_m + tile_m; let global_k = k_outer + tile_k; if (global_m >= params.m || global_k >= params.k) { - shmem[elem_idx] = f16(0.0); + let zero_vec4 = vec4(f16(0.0), f16(0.0), f16(0.0), f16(0.0)); + store_shmem_iquants(zero_vec4, elem_idx + 0u); + store_shmem_iquants(zero_vec4, elem_idx + 4u); + store_shmem_iquants(zero_vec4, elem_idx + 8u); + store_shmem_iquants(zero_vec4, elem_idx + 12u); continue; } let block_k = global_k / BLOCK_SIZE; - let k_in_block = global_k % BLOCK_SIZE; + let k_in_block = global_k % BLOCK_SIZE; // k_in_block % 16 == 0; - let src0_idx = batch_offset + global_m * params.stride_01 + block_k; - let block_byte_base = src0_idx * BLOCK_SIZE_BYTES; - let d = load_f16_as_f32_at_src0(block_byte_base); + let src0_idx = batch_offset + global_m * params.stride_01 + block_k; - let ib = k_in_block / 32u; - let pos = k_in_block % 32u; - let l = pos / 8u; - let j = pos % 8u; +#if defined(INIT_SRC0_SHMEM_IQ4_XS) + let block_byte_base = src0_idx * 136u; // BLOCK_SIZE_BYTES = 136u; + let d_byte_base = block_byte_base + 0u; + let scales_l_byte_base = block_byte_base + 4u; + let qs_byte_base = block_byte_base + 8u; - let qh = load_u32_at_src0(block_byte_base + 34u + ib * 2u) & 0xFFFFu; - let dl = d * (2.0 * f32((qh >> 12u) & 7u) + 1.0); - let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x8000u) != 0u); + let d_scales_h = load_u32_at_src0_aligned(d_byte_base); + let d = bitcast>(d_scales_h).x; + let scales_h = d_scales_h >> 16u; - let qs_w = load_u32_at_src0(block_byte_base + 2u + ib * 4u); - let ig = (get_byte(qs_w, l) | (((qh >> (3u * l)) & 7u) << 8u)) * 8u; + let sub_block = k_in_block / 32u; + let phase = (k_in_block / NQ) % 2u; - let gw = iq1_grid[(ig + j) / 16u]; - let g = (gw >> (((ig + j) % 16u) * 2u)) & 3u; - let gs = bitcast(g << 30u) >> 30u; + let scales_l_u32 = load_u32_at_src0_aligned(scales_l_byte_base); + let ls_lo = (get_byte(scales_l_u32, sub_block / 2u) >> (4u * (sub_block % 2u))) & 0xFu; + let ls_hi = ((scales_h >> (2u * sub_block)) & 3u) << 4u; + let dl = d * f16(i32(ls_lo | ls_hi) - 32); - shmem[elem_idx] = f16(dl * (f32(gs) + delta)); - } -} -#endif // INIT_SRC0_SHMEM_IQ1_S + let qs_0_3_u32 = load_u32_at_src0_aligned(qs_byte_base + 16u * sub_block + 0u); + let qs_4_7_u32 = load_u32_at_src0_aligned(qs_byte_base + 16u * sub_block + 4u); + let qs_8_11_u32 = load_u32_at_src0_aligned(qs_byte_base + 16u * sub_block + 8u); + let qs_12_15_u32 = load_u32_at_src0_aligned(qs_byte_base + 16u * sub_block + 12u); -#ifdef INIT_SRC0_SHMEM_IQ1_M -const BLOCK_SIZE = 256u; -const BLOCK_SIZE_BYTES = 56u; + store_shmem_iquants(create_iq_gw4(dl, qs_0_3_u32, phase), elem_idx + 0u); + store_shmem_iquants(create_iq_gw4(dl, qs_4_7_u32, phase), elem_idx + 4u); + store_shmem_iquants(create_iq_gw4(dl, qs_8_11_u32, phase), elem_idx + 8u); + store_shmem_iquants(create_iq_gw4(dl, qs_12_15_u32, phase), elem_idx + 12u); +#endif // INIT_SRC0_SHMEM_IQ4_XS -fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) { - for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) { - let tile_m = elem_idx / TILE_K; - let tile_k = elem_idx % TILE_K; - let global_m = offset_m + tile_m; - let global_k = k_outer + tile_k; +#if defined(INIT_SRC0_SHMEM_IQ1_S) + let block_byte_base = src0_idx * 50u; // BLOCK_SIZE_BYTES = 50u; + let d_byte_base = block_byte_base + 0u; + let qs_byte_base = block_byte_base + 2u; + let qh_byte_base = block_byte_base + 34u; - if (global_m >= params.m || global_k >= params.k) { - shmem[elem_idx] = f16(0.0); - continue; - } + let d = load_f16_as_f32_at_src0(d_byte_base); - let block_k = global_k / BLOCK_SIZE; - let k_in_block = global_k % BLOCK_SIZE; + let sub_block = k_in_block / 32u; + let phase = (k_in_block / NQ) % 2u; - let src0_idx = batch_offset + global_m * params.stride_01 + block_k; - let block_byte_base = src0_idx * BLOCK_SIZE_BYTES; + let qh_u16 = load_u32_at_src0(qh_byte_base + sub_block * 2u) & 0xFFFFu; + let qs_u16 = load_u32_at_src0(qs_byte_base + sub_block * 4u + phase * 2u) & 0xFFFFu; + + let dl = d * (2.0 * f32((qh_u16 >> 12u) & 7u) + 1.0); + let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh_u16 & 0x8000u) != 0u); + + let gp0_grid_id = ((qs_u16 & 0xFFu) | (((qh_u16 >> (phase * 6u)) & 7u) << 8u)) * 8u; + let gp1_grid_id = (((qs_u16 >> 8) & 0xFFu) | (((qh_u16 >> (phase * 6u + 3u)) & 7u) << 8u)) * 8u; + + let gp0_gw = iq1_grid[(gp0_grid_id) / 16u]; + let gp1_gw = iq1_grid[(gp1_grid_id) / 16u]; - let scales0 = load_u32_at_src0(block_byte_base + 48u); - let scales1 = load_u32_at_src0(block_byte_base + 52u); + let gp0_shift_base = (gp0_grid_id % 16u) * 2u; + let gp1_shift_base = (gp1_grid_id % 16u) * 2u; + + store_shmem_iquants(create_iq_gw4(dl, gp0_gw, gp0_shift_base + 0u, delta), elem_idx + 0u); + store_shmem_iquants(create_iq_gw4(dl, gp0_gw, gp0_shift_base + 8u, delta), elem_idx + 4u); + store_shmem_iquants(create_iq_gw4(dl, gp1_gw, gp1_shift_base + 0u, delta), elem_idx + 8u); + store_shmem_iquants(create_iq_gw4(dl, gp1_gw, gp1_shift_base + 8u, delta), elem_idx + 12u); +#endif // INIT_SRC0_SHMEM_IQ1_S + +#if defined(INIT_SRC0_SHMEM_IQ1_M) + let block_byte_base = src0_idx * 56u; // BLOCK_SIZE_BYTES = 56u; + let qs_byte_base = block_byte_base + 0u; + let qh_byte_base = block_byte_base + 32u; + let scales_byte_base = block_byte_base + 48u; + + let scales0 = load_u32_at_src0_aligned(scales_byte_base); + let scales1 = load_u32_at_src0_aligned(scales_byte_base + 4u); let scale_packed = ((scales0 >> 12u) & 0xFu) | ((scales0 >> 24u) & 0x00F0u) | ((scales1 >> 4u) & 0x0F00u) | ((scales1 >> 16u) & 0xF000u); let d = f32(bitcast>(scale_packed).x); - let ib = k_in_block / 32u; - let pos = k_in_block % 32u; - let l = pos / 8u; - let j = pos % 8u; + let sub_block = k_in_block / 32u; + let phase = (k_in_block / NQ) % 2u; - let scales = select(scales0, scales1, ib >= 4u); - let sw = (scales >> (16u * ((ib / 2u) % 2u))) & 0xFFFFu; - let s_pair = (sw >> (6u * (ib % 2u) + 3u * (l / 2u))) & 0x7u; - let dl = d * f32(2u * s_pair + 1u); + let scale_u32 = select(scales0, scales1, sub_block >= 4u); + let scale_u3 = (scale_u32 >> (16u * ((sub_block / 2u) % 2u) + 6u * (sub_block % 2u) + 3u * phase)) & 0x7u; + let dl = d * f32(2u * scale_u3 + 1u); - let qh_word = load_u32_at_src0(block_byte_base + 32u + (ib / 2u) * 4u); - let qh = qh_word >> (16u * (ib % 2u)); - let qh_nib = (qh >> (4u * l)) & 0xFu; + let qh_u8 = (load_u32_at_src0_aligned(qh_byte_base + 4u * (sub_block / 2u)) >> (16u * (sub_block % 2u) + 8u * phase)) & 0xFFu; + let qs_u16 = (load_u32_at_src0_aligned(qs_byte_base + 4u * sub_block) >> (16u * phase)) & 0xFFFFu; - let qs_w = load_u32_at_src0(block_byte_base + ib * 4u); - let idx = get_byte(qs_w, l) | ((qh_nib & 7u) << 8u); - let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh_nib & 0x8u) != 0u); + let gp0_grid_id = ((qs_u16 & 0xFFu) | ((qh_u8 & 7u) << 8u)) * 8u; + let gp0_delta = select(IQ1_DELTA, -IQ1_DELTA, (qh_u8 & 0x8u) != 0u); - let ig = idx * 8u; - let gw = iq1_grid[(ig + j) / 16u]; - let g = (gw >> (((ig + j) % 16u) * 2u)) & 3u; - let gs = bitcast(g << 30u) >> 30u; + let gp1_grid_id = (((qs_u16 >> 8u) & 0xFFu) | (((qh_u8 >> 4u) & 7u) << 8u)) * 8u; + let gp1_delta = select(IQ1_DELTA, -IQ1_DELTA, (qh_u8 & 0x80u) != 0u); - shmem[elem_idx] = f16(dl * (f32(gs) + delta)); - } -} -#endif // INIT_SRC0_SHMEM_IQ1_M + let gp0_gw = iq1_grid[(gp0_grid_id) / 16u]; + let gp1_gw = iq1_grid[(gp1_grid_id) / 16u]; -#ifdef INIT_SRC0_SHMEM_IQ2_XXS -const BLOCK_SIZE = 256u; -const BLOCK_SIZE_BYTES = 66u; + let gp0_shift_base = (gp0_grid_id % 16u) * 2u; + let gp1_shift_base = (gp1_grid_id % 16u) * 2u; -fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) { - for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) { - let tile_m = elem_idx / TILE_K; - let tile_k = elem_idx % TILE_K; - let global_m = offset_m + tile_m; - let global_k = k_outer + tile_k; + store_shmem_iquants(create_iq_gw4(dl, gp0_gw, gp0_shift_base + 0u, gp0_delta), elem_idx + 0u); + store_shmem_iquants(create_iq_gw4(dl, gp0_gw, gp0_shift_base + 8u, gp0_delta), elem_idx + 4u); + store_shmem_iquants(create_iq_gw4(dl, gp1_gw, gp1_shift_base + 0u, gp1_delta), elem_idx + 8u); + store_shmem_iquants(create_iq_gw4(dl, gp1_gw, gp1_shift_base + 8u, gp1_delta), elem_idx + 12u); +#endif // INIT_SRC0_SHMEM_IQ1_M - if (global_m >= params.m || global_k >= params.k) { - shmem[elem_idx] = f16(0.0); - continue; - } +#if defined(INIT_SRC0_SHMEM_IQ2_XXS) + let block_byte_base = src0_idx * 66u; // BLOCK_SIZE_BYTES = 66u; + let d_byte_base = block_byte_base + 0u; + let qs_byte_base = block_byte_base + 2u; - let block_k = global_k / BLOCK_SIZE; - let k_in_block = global_k % BLOCK_SIZE; + let d = load_f16_as_f32_at_src0(d_byte_base); - let src0_idx = batch_offset + global_m * params.stride_01 + block_k; - let block_byte_base = src0_idx * BLOCK_SIZE_BYTES; - let d = load_f16_as_f32_at_src0(block_byte_base); + let sub_block = k_in_block / 32u; + let phase = (k_in_block / NQ) % 2u; - let entry_idx = k_in_block / 8u; - let j = k_in_block % 8u; + let aux0 = load_u32_at_src0(qs_byte_base + 8u * sub_block + 0u); + let aux1 = load_u32_at_src0(qs_byte_base + 8u * sub_block + 4u); + let db = d * (0.5 + f32(aux1 >> 28u)) * 0.25; - let ib = entry_idx & ~3u; - let l = entry_idx & 3u; + let gp0_ig = get_byte(aux0, 2u * phase + 0u) * 8u; + let gp1_ig = get_byte(aux0, 2u * phase + 1u) * 8u; - let aux0 = load_u32_at_src0(block_byte_base + 2u + ib * 2u); - let aux1 = load_u32_at_src0(block_byte_base + 2u + (ib + 2u) * 2u); - let db = d * (0.5 + f32(aux1 >> 28u)) * 0.25; + let gp0_is = (aux1 >> (14u * phase + 0u)) & 127u; + let gp1_is = (aux1 >> (14u * phase + 7u)) & 127u; - let ig = get_byte(aux0, l) * 8u; - let is = (aux1 >> (7u * l)) & 127u; - let signs = get_byte(ksigns_iq2xs[is / 4u], is % 4u); + let gp0_signs = get_byte(ksigns_iq2xs[gp0_is / 4u], gp0_is % 4u); + let gp1_signs = get_byte(ksigns_iq2xs[gp1_is / 4u], gp1_is % 4u); - let g = get_byte(iq2xxs_grid[(ig + j) / 4u], (ig + j) % 4u); - let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4u], j % 4u) & signs) != 0u); + let m_0_3_val4 = create_iq2_m4(gp0_signs, 0); + let m_4_7_val4 = create_iq2_m4(gp0_signs, 1); + let m_8_11_val4 = create_iq2_m4(gp1_signs, 0); + let m_12_15_val4 = create_iq2_m4(gp1_signs, 1); - shmem[elem_idx] = f16(db * f32(g) * m); - } -} + let gw_0_3_val4 = create_iq_gw4(gp0_ig, 0); + let gw_4_7_val4 = create_iq_gw4(gp0_ig, 4); + let gw_8_11_val4 = create_iq_gw4(gp1_ig, 0); + let gw_12_15_val4 = create_iq_gw4(gp1_ig, 4); + + store_shmem_iquants(vec4(db * m_0_3_val4 * gw_0_3_val4), elem_idx + 0u); + store_shmem_iquants(vec4(db * m_4_7_val4 * gw_4_7_val4), elem_idx + 4u); + store_shmem_iquants(vec4(db * m_8_11_val4 * gw_8_11_val4), elem_idx + 8u); + store_shmem_iquants(vec4(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u); #endif // INIT_SRC0_SHMEM_IQ2_XXS -#ifdef INIT_SRC0_SHMEM_IQ2_XS -const BLOCK_SIZE = 256u; -const BLOCK_SIZE_BYTES = 74u; +#if defined(INIT_SRC0_SHMEM_IQ2_XS) + let block_byte_base = src0_idx * 74u; // BLOCK_SIZE_BYTES = 74u; + let d_byte_base = block_byte_base + 0u; + let qs_byte_base = block_byte_base + 2u; + let scales_byte_base = block_byte_base + 66u; -fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) { - for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) { - let tile_m = elem_idx / TILE_K; - let tile_k = elem_idx % TILE_K; - let global_m = offset_m + tile_m; - let global_k = k_outer + tile_k; + let d = load_f16_as_f32_at_src0(d_byte_base); - if (global_m >= params.m || global_k >= params.k) { - shmem[elem_idx] = f16(0.0); - continue; - } + let sub_block = k_in_block / 32u; + let phase = (k_in_block / NQ) % 2u; - let block_k = global_k / BLOCK_SIZE; - let k_in_block = global_k % BLOCK_SIZE; + let scale = (load_byte_at_src0_aligned(scales_byte_base + 1u * sub_block) >> (4u * phase)) & 0xFu; + let db = d * (0.5 + f32(scale)) * 0.25; - let src0_idx = batch_offset + global_m * params.stride_01 + block_k; - let block_byte_base = src0_idx * BLOCK_SIZE_BYTES; - let d = load_f16_as_f32_at_src0(block_byte_base); + let qs_u32 = load_u32_at_src0(qs_byte_base + 8u * sub_block + 4u * phase); - let entry_idx = k_in_block / 8u; - let j = k_in_block % 8u; + let gp0_ig = (qs_u32 & 0x1FFu) * 8u; + let gp1_ig = ((qs_u32 >> 16u) & 0x1FFu) * 8u; - let ib = entry_idx & ~3u; - let l = entry_idx & 3u; + let gp0_is = (qs_u32 >> 9u) & 0x7Fu; + let gp1_is = (qs_u32 >> 25u) & 0x7Fu; - let scales_word = load_u32_at_src0(block_byte_base + 66u + (ib / 16u) * 4u); - let s = get_byte(scales_word, (ib % 16u) / 4u); - let s_nib = select(s & 0xFu, (s >> 4u) & 0xFu, (l / 2u) != 0u); - let dl = d * (0.5 + f32(s_nib)) * 0.25; + let gp0_signs = get_byte(ksigns_iq2xs[gp0_is / 4u], gp0_is % 4u); + let gp1_signs = get_byte(ksigns_iq2xs[gp1_is / 4u], gp1_is % 4u); - let qs_word = load_u32_at_src0(block_byte_base + 2u + (ib + l) * 2u); - let qs_val = qs_word & 0xFFFFu; - let ig = (qs_val & 511u) * 8u; - let is = qs_val >> 9u; - let signs = get_byte(ksigns_iq2xs[is / 4u], is % 4u); + let m_0_3_val4 = create_iq2_m4(gp0_signs, 0); + let m_4_7_val4 = create_iq2_m4(gp0_signs, 1); + let m_8_11_val4 = create_iq2_m4(gp1_signs, 0); + let m_12_15_val4 = create_iq2_m4(gp1_signs, 1); - let g = get_byte(iq2xs_grid[(ig + j) / 4u], (ig + j) % 4u); - let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4u], j % 4u) & signs) != 0u); + let gw_0_3_val4 = create_iq_gw4(gp0_ig, 0); + let gw_4_7_val4 = create_iq_gw4(gp0_ig, 4); + let gw_8_11_val4 = create_iq_gw4(gp1_ig, 0); + let gw_12_15_val4 = create_iq_gw4(gp1_ig, 4); - shmem[elem_idx] = f16(dl * f32(g) * m); - } -} + store_shmem_iquants(vec4(db * m_0_3_val4 * gw_0_3_val4), elem_idx + 0u); + store_shmem_iquants(vec4(db * m_4_7_val4 * gw_4_7_val4), elem_idx + 4u); + store_shmem_iquants(vec4(db * m_8_11_val4 * gw_8_11_val4), elem_idx + 8u); + store_shmem_iquants(vec4(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u); #endif // INIT_SRC0_SHMEM_IQ2_XS -#ifdef INIT_SRC0_SHMEM_IQ2_S -const BLOCK_SIZE = 256u; -const BLOCK_SIZE_BYTES = 82u; +#if defined(INIT_SRC0_SHMEM_IQ2_S) + let block_byte_base = src0_idx * 82u; // BLOCK_SIZE_BYTES = 82u; + let d_byte_base = block_byte_base + 0u; + let qs_byte_base = block_byte_base + 2u; + let qh_byte_base = block_byte_base + 66u; + let scales_byte_base = block_byte_base + 74u; -fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) { - for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) { - let tile_m = elem_idx / TILE_K; - let tile_k = elem_idx % TILE_K; - let global_m = offset_m + tile_m; - let global_k = k_outer + tile_k; + let d = load_f16_as_f32_at_src0(d_byte_base); - if (global_m >= params.m || global_k >= params.k) { - shmem[elem_idx] = f16(0.0); - continue; - } + let sub_block = k_in_block / 32u; + let phase = (k_in_block / NQ) % 2u; - let block_k = global_k / BLOCK_SIZE; - let k_in_block = global_k % BLOCK_SIZE; - - let src0_idx = batch_offset + global_m * params.stride_01 + block_k; - let block_byte_base = src0_idx * BLOCK_SIZE_BYTES; - let d = load_f16_as_f32_at_src0(block_byte_base); + let scale = (load_byte_at_src0_aligned(scales_byte_base + 1u * sub_block) >> (4u * phase)) & 0xFu; + let db = d * (0.5 + f32(scale)) * 0.25; - let ib = k_in_block / 32u; - let l = (k_in_block % 32u) / 8u; - let j = k_in_block % 8u; + let qs_u16 = load_u32_at_src0(qs_byte_base + 4u * sub_block + 2u * phase) & 0xFFFFu; + let signs_u16 = load_u32_at_src0(qs_byte_base + 32u + 4u * sub_block + 2u * phase) & 0xFFFFu; + let qh_u4 = (load_byte_at_src0_aligned(qh_byte_base + 1u * sub_block) >> (4u * phase)) & 0xFu; - let scales_word = load_u32_at_src0(block_byte_base + 74u + (ib / 4u) * 4u); - let s = get_byte(scales_word, ib % 4u); - let s_nib = select(s & 0xFu, (s >> 4u) & 0xFu, (l / 2u) != 0u); - let dl = d * (0.5 + f32(s_nib)) * 0.25; + let gp0_ig = ((qs_u16 & 0xFFu) | ((qh_u4 & 0x3u) << 8u)) * 8u; + let gp1_ig = (((qs_u16 >> 8u) & 0xFFu) | ((qh_u4 & 0xCu) << 6u)) * 8u; - let qs_word = load_u32_at_src0(block_byte_base + 2u + ib * 4u); - let qh_word = load_u32_at_src0(block_byte_base + 66u + (ib / 4u) * 4u); - let qh_b = (get_byte(qh_word, ib % 4u) << (8u - 2u * l)) & 0x300u; - let ig = (get_byte(qs_word, l) | qh_b) * 8u; + let gp0_signs = get_byte(signs_u16, 0); + let gp1_signs = get_byte(signs_u16, 1); - let signs_word = load_u32_at_src0(block_byte_base + 34u + ib * 4u); - let signs = get_byte(signs_word, l); + let m_0_3_val4 = create_iq2_m4(gp0_signs, 0); + let m_4_7_val4 = create_iq2_m4(gp0_signs, 1); + let m_8_11_val4 = create_iq2_m4(gp1_signs, 0); + let m_12_15_val4 = create_iq2_m4(gp1_signs, 1); - let g = get_byte(iq2s_grid[(ig + j) / 4u], (ig + j) % 4u); - let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4u], j % 4u) & signs) != 0u); + let gw_0_3_val4 = create_iq_gw4(gp0_ig, 0); + let gw_4_7_val4 = create_iq_gw4(gp0_ig, 4); + let gw_8_11_val4 = create_iq_gw4(gp1_ig, 0); + let gw_12_15_val4 = create_iq_gw4(gp1_ig, 4); - shmem[elem_idx] = f16(dl * f32(g) * m); - } -} + store_shmem_iquants(vec4(db * m_0_3_val4 * gw_0_3_val4), elem_idx + 0u); + store_shmem_iquants(vec4(db * m_4_7_val4 * gw_4_7_val4), elem_idx + 4u); + store_shmem_iquants(vec4(db * m_8_11_val4 * gw_8_11_val4), elem_idx + 8u); + store_shmem_iquants(vec4(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u); #endif // INIT_SRC0_SHMEM_IQ2_S -#ifdef INIT_SRC0_SHMEM_IQ3_XXS -const BLOCK_SIZE = 256u; -const BLOCK_SIZE_BYTES = 98u; +#if defined(INIT_SRC0_SHMEM_IQ3_XXS) + let block_byte_base = src0_idx * 98u; // BLOCK_SIZE_BYTES = 98u; + let d_byte_base = block_byte_base + 0u; + let qs_byte_base = block_byte_base + 2u; -fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) { - for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) { - let tile_m = elem_idx / TILE_K; - let tile_k = elem_idx % TILE_K; - let global_m = offset_m + tile_m; - let global_k = k_outer + tile_k; + let d = load_f16_as_f32_at_src0(d_byte_base); - if (global_m >= params.m || global_k >= params.k) { - shmem[elem_idx] = f16(0.0); - continue; - } + let sub_block = k_in_block / 32u; + let phase = (k_in_block / NQ) % 2u; - let block_k = global_k / BLOCK_SIZE; - let k_in_block = global_k % BLOCK_SIZE; + let qs_u32 = load_u32_at_src0(qs_byte_base + 8u * sub_block + 4u * phase); + let sign_u32 = load_u32_at_src0(qs_byte_base + 64u + 4u * sub_block); + let db = d * (0.5 + f32(sign_u32 >> 28u)) * 0.5; - let src0_idx = batch_offset + global_m * params.stride_01 + block_k; - let block_byte_base = src0_idx * BLOCK_SIZE_BYTES; - let d = load_f16_as_f32_at_src0(block_byte_base); - - let ib_pair = k_in_block / 32u; - let in_pair = k_in_block % 32u; - let l = in_pair / 8u; - let in_l = in_pair % 8u; - let k2 = in_l / 4u; - let j = in_l % 4u; - - let ib = ib_pair * 2u; - let sc_sign_off = block_byte_base + 2u + (ib + 32u) * 2u; - let sc_sign = load_u32_at_src0(sc_sign_off); - let db = d * (0.5 + f32(sc_sign >> 28u)) * 0.5; - let is = (sc_sign >> (7u * l)) & 127u; - let signs = get_byte(ksigns_iq2xs[is / 4u], is % 4u); - - let ig_word = load_u32_at_src0(block_byte_base + 2u + (ib * 2u + l) * 2u) & 0xFFFFu; - let ig_byte = get_byte(ig_word, k2); - let g = get_byte(iq3xxs_grid[ig_byte], j); - let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[k2], j) & signs) != 0u); - - shmem[elem_idx] = f16(db * f32(g) * m); - } -} -#endif // INIT_SRC0_SHMEM_IQ3_XXS + let ig_0_3 = get_byte(qs_u32, 0); + let ig_4_7 = get_byte(qs_u32, 1); + let ig_8_11 = get_byte(qs_u32, 2); + let ig_12_15 = get_byte(qs_u32, 3); -#ifdef INIT_SRC0_SHMEM_IQ3_S -const BLOCK_SIZE = 256u; -const BLOCK_SIZE_BYTES = 110u; - -fn init_shmem_src0(thread_id: u32, batch_offset: u32, offset_m: u32, k_outer: u32) { - for (var elem_idx = thread_id; elem_idx < TILE_SRC0_SHMEM; elem_idx += TOTAL_WORKGROUP_SIZE) { - let tile_m = elem_idx / TILE_K; - let tile_k = elem_idx % TILE_K; - let global_m = offset_m + tile_m; - let global_k = k_outer + tile_k; - - if (global_m >= params.m || global_k >= params.k) { - shmem[elem_idx] = f16(0.0); - continue; - } - - let block_k = global_k / BLOCK_SIZE; - let k_in_block = global_k % BLOCK_SIZE; - - let src0_idx = batch_offset + global_m * params.stride_01 + block_k; - let block_byte_base = src0_idx * BLOCK_SIZE_BYTES; - let d = load_f16_as_f32_at_src0(block_byte_base); - - let ib = k_in_block / 64u; - let rest = k_in_block % 64u; - let k = rest / 32u; - let in_k = rest % 32u; - let l = in_k / 8u; - let in_l = in_k % 8u; - let k2 = in_l / 4u; - let j = in_l % 4u; + let gp0_is = (sign_u32 >> (14u * phase + 0u)) & 0x7Fu; + let gp1_is = (sign_u32 >> (14u * phase + 7u)) & 0x7Fu; - let scales_word = load_u32_at_src0(block_byte_base + 106u); - let s = get_byte(scales_word, ib); - let s_nib = select(s & 0xFu, (s >> 4u) & 0xFu, k != 0u); - let dl = d * (1.0 + 2.0 * f32(s_nib)); + let gp0_signs = get_byte(ksigns_iq2xs[gp0_is / 4u], gp0_is % 4u); + let gp1_signs = get_byte(ksigns_iq2xs[gp1_is / 4u], gp1_is % 4u); - let qh_word = load_u32_at_src0(block_byte_base + 66u + (ib / 2u) * 4u); - let qh_byte = get_byte(qh_word, (ib % 2u) * 2u + k); + let m_0_3_val4 = create_iq2_m4(gp0_signs, 0); + let m_4_7_val4 = create_iq2_m4(gp0_signs, 1); + let m_8_11_val4 = create_iq2_m4(gp1_signs, 0); + let m_12_15_val4 = create_iq2_m4(gp1_signs, 1); - let ig_word = load_u32_at_src0(block_byte_base + 2u + (ib * 8u + k * 4u + l) * 2u) & 0xFFFFu; - let ig_lo = get_byte(ig_word, 0u) | ((qh_byte << (8u - 2u * l)) & 256u); - let ig_hi = get_byte(ig_word, 1u) | ((qh_byte << (7u - 2u * l)) & 256u); - let ig = select(ig_lo, ig_hi, k2 != 0u); + let gw_0_3_val4 = create_iq_gw4(ig_0_3); + let gw_4_7_val4 = create_iq_gw4(ig_4_7); + let gw_8_11_val4 = create_iq_gw4(ig_8_11); + let gw_12_15_val4 = create_iq_gw4(ig_12_15); - let signs_word = load_u32_at_src0(block_byte_base + 74u + (ib * 2u + k) * 4u); - let signs = get_byte(signs_word, l); - - let g = get_byte(iq3s_grid[ig], j); - let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[k2], j) & signs) != 0u); + store_shmem_iquants(vec4(db * m_0_3_val4 * gw_0_3_val4), elem_idx + 0u); + store_shmem_iquants(vec4(db * m_4_7_val4 * gw_4_7_val4), elem_idx + 4u); + store_shmem_iquants(vec4(db * m_8_11_val4 * gw_8_11_val4), elem_idx + 8u); + store_shmem_iquants(vec4(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u); +#endif // INIT_SRC0_SHMEM_IQ3_XXS - shmem[elem_idx] = f16(dl * f32(g) * m); +#if defined(INIT_SRC0_SHMEM_IQ3_S) + let block_byte_base = src0_idx * 110u; // BLOCK_SIZE_BYTES = 110u; + let d_byte_base = block_byte_base + 0u; + let qs_byte_base = block_byte_base + 2u; + let qh_byte_base = block_byte_base + 66u; + let signs_byte_base = block_byte_base + 74u; + let scales_byte_base = block_byte_base + 106u; + + let d = load_f16_as_f32_at_src0(d_byte_base); + + let sub_block = k_in_block / 32u; + let phase = (k_in_block / NQ) % 2u; + + let scale = (load_byte_at_src0_aligned(scales_byte_base + 1u * (sub_block / 2u)) >> (4u * (sub_block % 2u))) & 0xFu; + let db = d * (1.0 + 2.0 * f32(scale)); + + let qs_u32 = load_u32_at_src0(qs_byte_base + 8u * sub_block + 4u * phase); + let qh_u4 = (load_byte_at_src0_aligned(qh_byte_base + 1u * sub_block) >> (4u * phase)) & 0xFu; + let signs_u16 = (load_u32_at_src0(signs_byte_base + 4u * sub_block + 2u * phase)) & 0xFFFFu; + + let ig_0_3 = ((qs_u32 >> 0u) & 0xFFu) | ((qh_u4 & 0x1u) << 8u); + let ig_4_7 = ((qs_u32 >> 8u) & 0xFFu) | ((qh_u4 & 0x2u) << 7u); + let ig_8_11 = ((qs_u32 >> 16u) & 0xFFu) | ((qh_u4 & 0x4u) << 6u); + let ig_12_15 = ((qs_u32 >> 24u) & 0xFFu) | ((qh_u4 & 0x8u) << 5u); + + let gp0_signs = get_byte(signs_u16, 0); + let gp1_signs = get_byte(signs_u16, 1); + + let m_0_3_val4 = create_iq2_m4(gp0_signs, 0); + let m_4_7_val4 = create_iq2_m4(gp0_signs, 1); + let m_8_11_val4 = create_iq2_m4(gp1_signs, 0); + let m_12_15_val4 = create_iq2_m4(gp1_signs, 1); + + let gw_0_3_val4 = create_iq_gw4(ig_0_3); + let gw_4_7_val4 = create_iq_gw4(ig_4_7); + let gw_8_11_val4 = create_iq_gw4(ig_8_11); + let gw_12_15_val4 = create_iq_gw4(ig_12_15); + + store_shmem_iquants(vec4(db * m_0_3_val4 * gw_0_3_val4), elem_idx + 0u); + store_shmem_iquants(vec4(db * m_4_7_val4 * gw_4_7_val4), elem_idx + 4u); + store_shmem_iquants(vec4(db * m_8_11_val4 * gw_8_11_val4), elem_idx + 8u); + store_shmem_iquants(vec4(db * m_12_15_val4 * gw_12_15_val4), elem_idx + 12u); +#endif // INIT_SRC0_SHMEM_IQ3_S } } -#endif // INIT_SRC0_SHMEM_IQ3_S +#endif // i-quants (super block size: 256) diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl index 6ff9bcf2df00..78ae955e6bad 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_id_vec.wgsl @@ -103,7 +103,7 @@ fn main( #ifdef USE_SUBGROUP_REDUCTION for (var row = 0u; row < OUTPUTS_PER_WG; row++) { - let subgroup_total = subgroupAdd(acc[row]); + let subgroup_total = subgroupAdd(acc[0][row]); if (subgroup_invocation_id == 0u) { partial_sums[partial_index(row, subgroup_id)] = subgroup_total; } @@ -126,7 +126,7 @@ fn main( #ifdef USE_WORKGROUP_REDUCTION for (var row = 0u; row < OUTPUTS_PER_WG; row++) { - partial_sums[partial_index(row, thread_id)] = acc[row]; + partial_sums[partial_index(row, thread_id)] = acc[0][row]; } workgroupBarrier(); diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl index f0a7fbd059a8..ebdf09513e22 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec.wgsl @@ -91,61 +91,67 @@ fn main( let dst_idx_base = params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row_base; #ifdef MMVQ - let src1q_idx_base = (src13_idx * params.bs02 * params.broadcast2 + src12_idx) * (params.k / 32u); + let src1q_idx_base = (src13_idx * params.bs02 * params.broadcast2 + src12_idx) * params.n * (params.k / 32u); let acc = accumulate_vec_q_dot(thread_id, row_base, src0_batch_offset, src1q_idx_base); #else let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12; let acc = accumulate_vec_dot(thread_id, row_base, src0_batch_offset, src1_idx_base); #endif -#ifdef USE_SUBGROUP_REDUCTION - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { - let subgroup_total = subgroupAdd(acc[row]); - if (subgroup_invocation_id == 0u) { - partial_sums[partial_index(row, subgroup_id)] = subgroup_total; - } - } + for (var col = 0u;col < NUM_COLS;col += 1) { - workgroupBarrier(); +#ifdef USE_SUBGROUP_REDUCTION + for (var row = 0u; row < OUTPUTS_PER_WG; row++) { + let subgroup_total = subgroupAdd(acc[col][row]); + if (subgroup_invocation_id == 0u) { + partial_sums[partial_index(row, subgroup_id)] = subgroup_total; + } + } - for (var row = subgroup_id; (row < OUTPUTS_PER_WG) && (row_base + row < params.m); row += num_subgroups) { - let output_row = row_base + row; - var row_acc = 0.0f; - for (var k = subgroup_invocation_id; k < num_subgroups; k += subgroup_size) { - row_acc += partial_sums[partial_index(row, k)]; - } - let row_total = subgroupAdd(row_acc); - if (subgroup_invocation_id == 0) { - dst[dst_idx_base + row] = row_total; - } - } + workgroupBarrier(); + + for (var row = subgroup_id; (row < OUTPUTS_PER_WG) && (row_base + row < params.m); row += num_subgroups) { + let output_row = row_base + row; + var row_acc = 0.0f; + for (var k = subgroup_invocation_id; k < num_subgroups; k += subgroup_size) { + row_acc += partial_sums[partial_index(row, k)]; + } + let row_total = subgroupAdd(row_acc); + if (subgroup_invocation_id == 0) { + dst[dst_idx_base + col * params.m + row] = row_total; + } + } #endif #ifdef USE_WORKGROUP_REDUCTION - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { - partial_sums[partial_index(row, thread_id)] = acc[row]; - } + for (var row = 0u; row < OUTPUTS_PER_WG; row++) { + partial_sums[partial_index(row, thread_id)] = acc[col][row]; + } - workgroupBarrier(); + workgroupBarrier(); - var stride = WG_SIZE / 2u; + var stride = WG_SIZE / 2u; - while (stride > 0) { - if (thread_id < stride) { - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { - partial_sums[partial_index(row, thread_id)] += partial_sums[partial_index(row, thread_id + stride)]; + while (stride > 0) { + if (thread_id < stride) { + for (var row = 0u; row < OUTPUTS_PER_WG; row++) { + partial_sums[partial_index(row, thread_id)] += partial_sums[partial_index(row, thread_id + stride)]; + } + } + + workgroupBarrier(); + stride = stride / 2; } - } - workgroupBarrier(); - stride = stride / 2; - } + if (thread_id < OUTPUTS_PER_WG) { + let output_row = row_base + thread_id; + if (output_row < params.m) { + dst[dst_idx_base + col * params.m + thread_id] = partial_sums[partial_index(thread_id, 0)]; + } + } +#endif + + workgroupBarrier(); - if (thread_id < OUTPUTS_PER_WG) { - let output_row = row_base + thread_id; - if (output_row < params.m) { - dst[dst_idx_base + thread_id] = partial_sums[partial_index(thread_id, 0)]; - } } -#endif } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl index 08753b9d643f..b0703fe9062d 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_acc.tmpl @@ -32,8 +32,8 @@ fn inner_dot(src0_val: SRC0_TYPE, src1_val: SRC1_TYPE) -> f32 { #endif #ifdef MUL_ACC_FLOAT -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let k_vec = params.k / VEC_SIZE; let src1_idx_base_vec = src1_idx_base / VEC_SIZE; @@ -41,12 +41,18 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src // Each thread walks K, loads from the vector, and updates // a small block of output rows held in registers. for (var k = thread_id; k < k_vec; k += WG_SIZE) { - let x = src1[src1_idx_base_vec + k]; + var x_vals: array; + for (var col = 0u;col < NUM_COLS;col += 1) { + x_vals[col] = src1[src1_idx_base_vec + col * (params.stride_11 / VEC_SIZE) + k]; + } for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { let src0_idx = (src0_batch_offset + output_row * params.stride_01) / VEC_SIZE + k; - acc[row] += inner_dot(src0[src0_idx], x); + let w = src0[src0_idx]; + for (var col = 0u;col < NUM_COLS;col += 1) { + acc[col][row] += inner_dot(w, x_vals[col]); + } } } } @@ -60,30 +66,33 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE_BYTES 18 #define THREADS_PER_BLOCK 16 #define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK) -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let num_blocks = params.k / BLOCK_SIZE; let thread_within_block = thread_id % THREADS_PER_BLOCK; for (var block = thread_id / THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE / THREADS_PER_BLOCK) { let x_base = src1_idx_base + block * BLOCK_SIZE + thread_within_block * ELEMS_PER_THREAD; - var x_block: array; - for (var i = 0u; i < ELEMS_PER_THREAD; i++) { - x_block[i] = f32(src1[x_base + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < ELEMS_PER_THREAD; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES; let d = f32(load_f16_at_src0(block_byte_base)); let q_byte = load_u32_at_src0(block_byte_base + 2u + thread_within_block) & 0xFFu; - var row_sum = 0.0; - for (var bit = 0u; bit < 8u; bit++) { - let w = select(-d, d, ((q_byte >> bit) & 1u) != 0u); - row_sum += w * x_block[bit]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var bit = 0u; bit < 8u; bit++) { + let w = select(-d, d, ((q_byte >> bit) & 1u) != 0u); + row_sum += w * x_block[col][bit]; + } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -97,35 +106,37 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE_BYTES 18 #define THREADS_PER_BLOCK 4 #define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK) -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let num_blocks = params.k / BLOCK_SIZE; let thread_within_block = thread_id % 4; for (var block = thread_id/THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE/THREADS_PER_BLOCK) { let x_base = src1_idx_base + block * BLOCK_SIZE + thread_within_block * 4; - var x_block: array; - for (var i = 0u; i < ELEMS_PER_THREAD / 2; i++) { - x_block[i] = f32(src1[x_base + i]); - x_block[i + 4] = f32(src1[x_base + i + 16]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < ELEMS_PER_THREAD / 2; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + x_block[col][i + 4] = f32(src1[x_base + col * params.stride_11 + i + 16]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES; let d = f32(load_f16_at_src0(block_byte_base)); - var row_sum = 0.0; - let q_packed = load_u32_at_src0(block_byte_base + 2u + 4u * thread_within_block); - for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { - let q_byte = get_byte(q_packed, byte_idx); - let q_lo = (f32(q_byte & 0xFu) - 8.0) * d; - let q_hi = (f32((q_byte >> 4u) & 0xFu) - 8.0) * d; - row_sum += q_lo * x_block[byte_idx]; - row_sum += q_hi * x_block[byte_idx + 4u]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { + let q_byte = get_byte(q_packed, byte_idx); + let q_lo = (f32(q_byte & 0xFu) - 8.0) * d; + let q_hi = (f32((q_byte >> 4u) & 0xFu) - 8.0) * d; + row_sum += q_lo * x_block[col][byte_idx]; + row_sum += q_hi * x_block[col][byte_idx + 4u]; + } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -139,36 +150,38 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE_BYTES 20 #define THREADS_PER_BLOCK 4 #define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK) -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let num_blocks = params.k / BLOCK_SIZE; let thread_within_block = thread_id % THREADS_PER_BLOCK; for (var block = thread_id / THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE / THREADS_PER_BLOCK) { let x_base = src1_idx_base + block * BLOCK_SIZE + thread_within_block * 4; - var x_block: array; - for (var i = 0u; i < ELEMS_PER_THREAD / 2; i++) { - x_block[i] = f32(src1[x_base + i]); - x_block[i + 4] = f32(src1[x_base + i + 16]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < ELEMS_PER_THREAD / 2; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + x_block[col][i + 4] = f32(src1[x_base + col * params.stride_11 + i + 16]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES; let d = f32(load_f16_at_src0(block_byte_base)); let m = f32(load_f16_at_src0(block_byte_base + 2u)); - var row_sum = 0.0; - let q_packed = load_u32_at_src0(block_byte_base + 4u + 4u * thread_within_block); - for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { - let q_byte = get_byte(q_packed, byte_idx); - let q_lo = f32(q_byte & 0xFu) * d + m; - let q_hi = f32((q_byte >> 4u) & 0xFu) * d + m; - row_sum += q_lo * x_block[byte_idx]; - row_sum += q_hi * x_block[byte_idx + 4u]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { + let q_byte = get_byte(q_packed, byte_idx); + let q_lo = f32(q_byte & 0xFu) * d + m; + let q_hi = f32((q_byte >> 4u) & 0xFu) * d + m; + row_sum += q_lo * x_block[col][byte_idx]; + row_sum += q_hi * x_block[col][byte_idx + 4u]; + } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -182,19 +195,20 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE_BYTES 22 #define THREADS_PER_BLOCK 4 #define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK) -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let num_blocks = params.k / BLOCK_SIZE; let thread_within_block = thread_id % THREADS_PER_BLOCK; for (var block = thread_id / THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE / THREADS_PER_BLOCK) { let x_base = src1_idx_base + block * BLOCK_SIZE + thread_within_block * 4; - var x_block: array; - for (var i = 0u; i < ELEMS_PER_THREAD / 2; i++) { - x_block[i] = f32(src1[x_base + i]); - x_block[i + 4] = f32(src1[x_base + i + 16]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < ELEMS_PER_THREAD / 2; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + x_block[col][i + 4] = f32(src1[x_base + col * params.stride_11 + i + 16]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { @@ -203,18 +217,19 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let qh_packed = load_u32_at_src0(block_byte_base + 2u); let q_packed = load_u32_at_src0(block_byte_base + 6u + 4u * thread_within_block); let qh_shift = thread_within_block * 4u; - var row_sum = 0.0; - - for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { - let q_byte = get_byte(q_packed, byte_idx); - let qh_lo = ((qh_packed >> (qh_shift + byte_idx)) << 4u) & 0x10u; - let qh_hi = (qh_packed >> (qh_shift + byte_idx + 12u)) & 0x10u; - let q_lo = (f32((q_byte & 0xFu) | qh_lo) - 16.0) * d; - let q_hi = (f32(((q_byte >> 4u) & 0xFu) | qh_hi) - 16.0) * d; - row_sum += q_lo * x_block[byte_idx]; - row_sum += q_hi * x_block[byte_idx + 4u]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { + let q_byte = get_byte(q_packed, byte_idx); + let qh_lo = ((qh_packed >> (qh_shift + byte_idx)) << 4u) & 0x10u; + let qh_hi = (qh_packed >> (qh_shift + byte_idx + 12u)) & 0x10u; + let q_lo = (f32((q_byte & 0xFu) | qh_lo) - 16.0) * d; + let q_hi = (f32(((q_byte >> 4u) & 0xFu) | qh_hi) - 16.0) * d; + row_sum += q_lo * x_block[col][byte_idx]; + row_sum += q_hi * x_block[col][byte_idx + 4u]; + } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -228,19 +243,20 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE_BYTES 24 #define THREADS_PER_BLOCK 4 #define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK) -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let num_blocks = params.k / BLOCK_SIZE; let thread_within_block = thread_id % THREADS_PER_BLOCK; for (var block = thread_id / THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE / THREADS_PER_BLOCK) { let x_base = src1_idx_base + block * BLOCK_SIZE + thread_within_block * 4; - var x_block: array; - for (var i = 0u; i < ELEMS_PER_THREAD / 2; i++) { - x_block[i] = f32(src1[x_base + i]); - x_block[i + 4] = f32(src1[x_base + i + 16]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < ELEMS_PER_THREAD / 2; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + x_block[col][i + 4] = f32(src1[x_base + col * params.stride_11 + i + 16]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { @@ -250,18 +266,19 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let qh_packed = load_u32_at_src0(block_byte_base + 4u); let q_packed = load_u32_at_src0(block_byte_base + 8u + 4u * thread_within_block); let qh_shift = thread_within_block * 4u; - var row_sum = 0.0; - - for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { - let q_byte = get_byte(q_packed, byte_idx); - let qh_lo = ((qh_packed >> (qh_shift + byte_idx)) << 4u) & 0x10u; - let qh_hi = (qh_packed >> (qh_shift + byte_idx + 12u)) & 0x10u; - let q_lo = f32((q_byte & 0xFu) | qh_lo) * d + m; - let q_hi = f32(((q_byte >> 4u) & 0xFu) | qh_hi) * d + m; - row_sum += q_lo * x_block[byte_idx]; - row_sum += q_hi * x_block[byte_idx + 4u]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { + let q_byte = get_byte(q_packed, byte_idx); + let qh_lo = ((qh_packed >> (qh_shift + byte_idx)) << 4u) & 0x10u; + let qh_hi = (qh_packed >> (qh_shift + byte_idx + 12u)) & 0x10u; + let q_lo = f32((q_byte & 0xFu) | qh_lo) * d + m; + let q_hi = f32(((q_byte >> 4u) & 0xFu) | qh_hi) * d + m; + row_sum += q_lo * x_block[col][byte_idx]; + row_sum += q_hi * x_block[col][byte_idx + 4u]; + } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -275,33 +292,38 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE_BYTES 34 #define THREADS_PER_BLOCK 4 #define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK) -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let num_blocks = params.k / BLOCK_SIZE; let thread_within_block = thread_id % THREADS_PER_BLOCK; for (var block = thread_id / THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE / THREADS_PER_BLOCK) { let x_base = src1_idx_base + block * BLOCK_SIZE + thread_within_block * ELEMS_PER_THREAD; - var x_block: array; - for (var i = 0u; i < ELEMS_PER_THREAD; i++) { - x_block[i] = f32(src1[x_base + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < ELEMS_PER_THREAD; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES; let d = f32(load_f16_at_src0(block_byte_base)); - var row_sum = 0.0; - + var q_packed: array; for (var packed_idx = 0u; packed_idx < ELEMS_PER_THREAD / 4u; packed_idx++) { - let q_packed = load_u32_at_src0(block_byte_base + 2u + 4u * (thread_within_block * 2u + packed_idx)); - for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { - let q_val = f32(get_byte_i32(q_packed, byte_idx)) * d; - row_sum += q_val * x_block[packed_idx * 4u + byte_idx]; + q_packed[packed_idx] = load_u32_at_src0(block_byte_base + 2u + 4u * (thread_within_block * 2u + packed_idx)); + } + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var packed_idx = 0u; packed_idx < ELEMS_PER_THREAD / 4u; packed_idx++) { + for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { + let q_val = f32(get_byte_i32(q_packed[packed_idx], byte_idx)) * d; + row_sum += q_val * x_block[col][packed_idx * 4u + byte_idx]; + } } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -315,34 +337,39 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE_BYTES 36 #define THREADS_PER_BLOCK 4 #define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK) -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let num_blocks = params.k / BLOCK_SIZE; let thread_within_block = thread_id % THREADS_PER_BLOCK; for (var block = thread_id / THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE / THREADS_PER_BLOCK) { let x_base = src1_idx_base + block * BLOCK_SIZE + thread_within_block * ELEMS_PER_THREAD; - var x_block: array; - for (var i = 0u; i < ELEMS_PER_THREAD; i++) { - x_block[i] = f32(src1[x_base + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < ELEMS_PER_THREAD; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES; let d = f32(load_f16_at_src0(block_byte_base)); let m = f32(load_f16_at_src0(block_byte_base + 2u)); - var row_sum = 0.0; - + var q_packed: array; for (var packed_idx = 0u; packed_idx < ELEMS_PER_THREAD / 4u; packed_idx++) { - let q_packed = load_u32_at_src0(block_byte_base + 4u + 4u * (thread_within_block * 2u + packed_idx)); - for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { - let q_val = f32(get_byte_i32(q_packed, byte_idx)) * d + m; - row_sum += q_val * x_block[packed_idx * 4u + byte_idx]; + q_packed[packed_idx] = load_u32_at_src0(block_byte_base + 4u + 4u * (thread_within_block * 2u + packed_idx)); + } + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var packed_idx = 0u; packed_idx < ELEMS_PER_THREAD / 4u; packed_idx++) { + for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { + let q_val = f32(get_byte_i32(q_packed[packed_idx], byte_idx)) * d + m; + row_sum += q_val * x_block[col][packed_idx * 4u + byte_idx]; + } } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -355,8 +382,8 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE 256 #define BLOCK_SIZE_BYTES 84 #define THREADS_PER_BLOCK 16 -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let tid = thread_id % THREADS_PER_BLOCK; let block_group = thread_id / THREADS_PER_BLOCK; @@ -379,14 +406,15 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src for (var block = block_group; block < num_blocks; block += num_block_groups) { let x_base = src1_idx_base + block * BLOCK_SIZE + y_offset; - var x_block: array; - for (var i = 0u; i < 4u; i++) { - x_block[i] = f32(src1[x_base + i]); - x_block[i + 4u] = f32(src1[x_base + 32u + i]); - x_block[i + 8u] = f32(src1[x_base + 64u + i]); - x_block[i + 12u] = f32(src1[x_base + 96u + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < 4u; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + x_block[col][i + 4u] = f32(src1[x_base + col * params.stride_11 + 32u + i]); + x_block[col][i + 8u] = f32(src1[x_base + col * params.stride_11 + 64u + i]); + x_block[col][i + 12u] = f32(src1[x_base + col * params.stride_11 + 96u + i]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { @@ -404,30 +432,32 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let qs0 = q_u32 & 0xFFFFu; let qs1 = q_u32 >> 16u; - var sumy = vec4(0.0, 0.0, 0.0, 0.0); - var acc1 = vec4(0.0, 0.0, 0.0, 0.0); - var acc2 = vec4(0.0, 0.0, 0.0, 0.0); - - sumy[0] = x_block[0] + x_block[1] + x_block[2] + x_block[3]; - sumy[1] = x_block[4] + x_block[5] + x_block[6] + x_block[7]; - sumy[2] = x_block[8] + x_block[9] + x_block[10] + x_block[11]; - sumy[3] = x_block[12] + x_block[13] + x_block[14] + x_block[15]; - - acc1[0] = x_block[0] * f32(qs0 & 0x0003u) + x_block[2] * f32(qs1 & 0x0003u); - acc2[0] = x_block[1] * f32(qs0 & 0x0300u) + x_block[3] * f32(qs1 & 0x0300u); - acc1[1] = x_block[4] * f32(qs0 & 0x000Cu) + x_block[6] * f32(qs1 & 0x000Cu); - acc2[1] = x_block[5] * f32(qs0 & 0x0C00u) + x_block[7] * f32(qs1 & 0x0C00u); - acc1[2] = x_block[8] * f32(qs0 & 0x0030u) + x_block[10] * f32(qs1 & 0x0030u); - acc2[2] = x_block[9] * f32(qs0 & 0x3000u) + x_block[11] * f32(qs1 & 0x3000u); - acc1[3] = x_block[12] * f32(qs0 & 0x00C0u) + x_block[14] * f32(qs1 & 0x00C0u); - acc2[3] = x_block[13] * f32(qs0 & 0xC000u) + x_block[15] * f32(qs1 & 0xC000u); - - acc[row] += dall * ((acc1[0] + (1.0/256.0) * acc2[0]) * f32(sc0 & 0xFu) + - (acc1[1] + (1.0/256.0) * acc2[1]) * f32(sc2 & 0xFu) / 4.0 + - (acc1[2] + (1.0/256.0) * acc2[2]) * f32(sc4 & 0xFu) / 16.0 + - (acc1[3] + (1.0/256.0) * acc2[3]) * f32(sc6 & 0xFu) / 64.0) - - dmin * (sumy[0] * f32(sc0 & 0xF0u) + sumy[1] * f32(sc2 & 0xF0u) + - sumy[2] * f32(sc4 & 0xF0u) + sumy[3] * f32(sc6 & 0xF0u)); + for (var col = 0u;col < NUM_COLS;col += 1) { + var sumy = vec4(0.0, 0.0, 0.0, 0.0); + var acc1 = vec4(0.0, 0.0, 0.0, 0.0); + var acc2 = vec4(0.0, 0.0, 0.0, 0.0); + + sumy[0] = x_block[col][0] + x_block[col][1] + x_block[col][2] + x_block[col][3]; + sumy[1] = x_block[col][4] + x_block[col][5] + x_block[col][6] + x_block[col][7]; + sumy[2] = x_block[col][8] + x_block[col][9] + x_block[col][10] + x_block[col][11]; + sumy[3] = x_block[col][12] + x_block[col][13] + x_block[col][14] + x_block[col][15]; + + acc1[0] = x_block[col][0] * f32(qs0 & 0x0003u) + x_block[col][2] * f32(qs1 & 0x0003u); + acc2[0] = x_block[col][1] * f32(qs0 & 0x0300u) + x_block[col][3] * f32(qs1 & 0x0300u); + acc1[1] = x_block[col][4] * f32(qs0 & 0x000Cu) + x_block[col][6] * f32(qs1 & 0x000Cu); + acc2[1] = x_block[col][5] * f32(qs0 & 0x0C00u) + x_block[col][7] * f32(qs1 & 0x0C00u); + acc1[2] = x_block[col][8] * f32(qs0 & 0x0030u) + x_block[col][10] * f32(qs1 & 0x0030u); + acc2[2] = x_block[col][9] * f32(qs0 & 0x3000u) + x_block[col][11] * f32(qs1 & 0x3000u); + acc1[3] = x_block[col][12] * f32(qs0 & 0x00C0u) + x_block[col][14] * f32(qs1 & 0x00C0u); + acc2[3] = x_block[col][13] * f32(qs0 & 0xC000u) + x_block[col][15] * f32(qs1 & 0xC000u); + + acc[col][row] += dall * ((acc1[0] + (1.0/256.0) * acc2[0]) * f32(sc0 & 0xFu) + + (acc1[1] + (1.0/256.0) * acc2[1]) * f32(sc2 & 0xFu) / 4.0 + + (acc1[2] + (1.0/256.0) * acc2[2]) * f32(sc4 & 0xFu) / 16.0 + + (acc1[3] + (1.0/256.0) * acc2[3]) * f32(sc6 & 0xFu) / 64.0) + - dmin * (sumy[0] * f32(sc0 & 0xF0u) + sumy[1] * f32(sc2 & 0xF0u) + + sumy[2] * f32(sc4 & 0xF0u) + sumy[3] * f32(sc6 & 0xF0u)); + } } } } @@ -440,8 +470,8 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE 256 #define BLOCK_SIZE_BYTES 110 #define THREADS_PER_BLOCK 16 -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let tid = thread_id % THREADS_PER_BLOCK; let block_group = thread_id / THREADS_PER_BLOCK; @@ -485,12 +515,13 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src for (var block = block_group; block < num_blocks; block += num_block_groups) { let x_base = src1_idx_base + block * BLOCK_SIZE + y_offset; - var x_block: array; - for (var i = 0u; i < 8u; i++) { - x_block[i] = f32(src1[x_base + i]); - x_block[i + 8u] = f32(src1[x_base + 32u + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < 8u; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + x_block[col][i + 8u] = f32(src1[x_base + col * params.stride_11 + 32u + i]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { @@ -516,28 +547,30 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let h_u32_0 = load_u32_at_src0(block_byte_base + h_byte + 0u); let h_u32_1 = load_u32_at_src0(block_byte_base + h_byte + 4u); - var s1 = 0.0; var s2 = 0.0; var s3 = 0.0; - var s4 = 0.0; var s5 = 0.0; var s6 = 0.0; - - for (var l = 0u; l < 8u; l += 2u) { - let q_u32 = select(q_u32_0, q_u32_1, l >= 4u); - let qs = select(q_u32 & 0xFFFFu, q_u32 >> 16u, (l & 2u) != 0u); - let h_u32 = select(h_u32_0, h_u32_1, l >= 4u); - let hv = select(h_u32 & 0xFFFFu, h_u32 >> 16u, (l & 2u) != 0u); - - s1 += x_block[l + 0u] * f32(qs & qm0); - s2 += x_block[l + 1u] * f32(qs & qm1); - s3 += select(0.0, x_block[l + 0u], (hv & hm0) == 0u) + - select(0.0, x_block[l + 1u], (hv & hm1) == 0u); - s4 += x_block[l + 8u] * f32(qs & qm2); - s5 += x_block[l + 9u] * f32(qs & qm3); - s6 += select(0.0, x_block[l + 8u], (hv & hm2) == 0u) + - select(0.0, x_block[l + 9u], (hv & hm3) == 0u); - } + for (var col = 0u;col < NUM_COLS;col += 1) { + var s1 = 0.0; var s2 = 0.0; var s3 = 0.0; + var s4 = 0.0; var s5 = 0.0; var s6 = 0.0; + + for (var l = 0u; l < 8u; l += 2u) { + let q_u32 = select(q_u32_0, q_u32_1, l >= 4u); + let qs = select(q_u32 & 0xFFFFu, q_u32 >> 16u, (l & 2u) != 0u); + let h_u32 = select(h_u32_0, h_u32_1, l >= 4u); + let hv = select(h_u32 & 0xFFFFu, h_u32 >> 16u, (l & 2u) != 0u); + + s1 += x_block[col][l + 0u] * f32(qs & qm0); + s2 += x_block[col][l + 1u] * f32(qs & qm1); + s3 += select(0.0, x_block[col][l + 0u], (hv & hm0) == 0u) + + select(0.0, x_block[col][l + 1u], (hv & hm1) == 0u); + s4 += x_block[col][l + 8u] * f32(qs & qm2); + s5 += x_block[col][l + 9u] * f32(qs & qm3); + s6 += select(0.0, x_block[col][l + 8u], (hv & hm2) == 0u) + + select(0.0, x_block[col][l + 9u], (hv & hm3) == 0u); + } - let d1 = d * (s1 + (1.0/256.0) * s2 - s3 * v1); - let d2 = d * (s4 + (1.0/256.0) * s5 - s6 * v2); - acc[row] += (d1 * scale0 + 0.25 * d2 * scale1) / f32(1u << shift); + let d1 = d * (s1 + (1.0/256.0) * s2 - s3 * v1); + let d2 = d * (s4 + (1.0/256.0) * s5 - s6 * v2); + acc[col][row] += (d1 * scale0 + 0.25 * d2 * scale1) / f32(1u << shift); + } } } } @@ -550,8 +583,8 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE 256 #define BLOCK_SIZE_BYTES 144 #define THREADS_PER_BLOCK 16 -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let tid = thread_id % THREADS_PER_BLOCK; let block_group = thread_id / THREADS_PER_BLOCK; @@ -573,12 +606,15 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src for (var block = block_group; block < num_blocks; block += num_block_groups) { let x_base = src1_idx_base + block * BLOCK_SIZE + y_offset; - var x_block: array; - for (var i = 0u; i < 4u; i++) { - x_block[i] = f32(src1[x_base + i]); - x_block[i + 4u] = f32(src1[x_base + 32u + i]); - x_block[i + 8u] = f32(src1[x_base + 128u + i]); - x_block[i + 12u] = f32(src1[x_base + 160u + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + let col_base = x_base + col * params.stride_11; + for (var i = 0u; i < 4u; i++) { + x_block[col][i] = f32(src1[col_base + i]); + x_block[col][i + 4u] = f32(src1[col_base + 32u + i]); + x_block[col][i + 8u] = f32(src1[col_base + 128u + i]); + x_block[col][i + 12u] = f32(src1[col_base + 160u + i]); + } } for (var row = 0u; row < OUTPUTS_PER_WG; row++) { @@ -613,23 +649,25 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let q1_u32 = load_u32_at_src0_aligned(block_byte_base + 16u + q_offset); let q2_u32 = load_u32_at_src0_aligned(block_byte_base + 80u + q_offset); - var dot = vec4(0.0, 0.0, 0.0, 0.0); - var sumx = vec4(0.0, 0.0, 0.0, 0.0); - for (var i = 0u; i < 4u; i++) { - let q1b = byte_of(q1_u32, i); - let q2b = byte_of(q2_u32, i); - dot[0] += x_block[i] * f32(q1b & 0x0Fu); - dot[1] += x_block[i + 4u] * f32(q1b >> 4u); - dot[2] += x_block[i + 8u] * f32(q2b & 0x0Fu); - dot[3] += x_block[i + 12u] * f32(q2b >> 4u); - sumx[0] += x_block[i]; - sumx[1] += x_block[i + 4u]; - sumx[2] += x_block[i + 8u]; - sumx[3] += x_block[i + 12u]; - } + for (var col = 0u;col < NUM_COLS;col += 1) { + var dot = vec4(0.0, 0.0, 0.0, 0.0); + var sumx = vec4(0.0, 0.0, 0.0, 0.0); + for (var i = 0u; i < 4u; i++) { + let q1b = byte_of(q1_u32, i); + let q2b = byte_of(q2_u32, i); + dot[0] += x_block[col][i] * f32(q1b & 0x0Fu); + dot[1] += x_block[col][i + 4u] * f32(q1b >> 4u); + dot[2] += x_block[col][i + 8u] * f32(q2b & 0x0Fu); + dot[3] += x_block[col][i + 12u] * f32(q2b >> 4u); + sumx[0] += x_block[col][i]; + sumx[1] += x_block[col][i + 4u]; + sumx[2] += x_block[col][i + 8u]; + sumx[3] += x_block[col][i + 12u]; + } - acc[row] += d * (dot[0] * scale0 + dot[1] * scale1 + dot[2] * scale2 + dot[3] * scale3) - - dmin * (sumx[0] * min0 + sumx[1] * min1 + sumx[2] * min2 + sumx[3] * min3); + acc[col][row] += d * (dot[0] * scale0 + dot[1] * scale1 + dot[2] * scale2 + dot[3] * scale3) + - dmin * (sumx[0] * min0 + sumx[1] * min1 + sumx[2] * min2 + sumx[3] * min3); + } } } } @@ -642,8 +680,8 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE 256 #define BLOCK_SIZE_BYTES 176 #define THREADS_PER_BLOCK 16 -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let tid = thread_id % THREADS_PER_BLOCK; let block_group = thread_id / THREADS_PER_BLOCK; @@ -671,14 +709,16 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src for (var block = block_group; block < num_blocks; block += num_block_groups) { let x_base = src1_idx_base + block * BLOCK_SIZE + y_offset; - var x_block: array; - for (var i = 0u; i < 4u; i++) { - x_block[i] = f32(src1[x_base + i]); - x_block[i + 4u] = f32(src1[x_base + 32u + i]); - x_block[i + 8u] = f32(src1[x_base + 128u + i]); - x_block[i + 12u] = f32(src1[x_base + 160u + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + let col_base = x_base + col * params.stride_11; + for (var i = 0u; i < 4u; i++) { + x_block[col][i] = f32(src1[col_base + i]); + x_block[col][i + 4u] = f32(src1[col_base + 32u + i]); + x_block[col][i + 8u] = f32(src1[col_base + 128u + i]); + x_block[col][i + 12u] = f32(src1[col_base + 160u + i]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { @@ -712,37 +752,39 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let q2_u32 = load_u32_at_src0_aligned(block_byte_base + q_offset + 64u); let qh_u32 = load_u32_at_src0_aligned(block_byte_base + qh_offset); - var vals = vec4(0.0, 0.0, 0.0, 0.0); - var sumy = vec4(0.0, 0.0, 0.0, 0.0); - for (var i = 0u; i < 4u; i++) { - let q1b = byte_of(q1_u32, i); - let q2b = byte_of(q2_u32, i); - let qhb = byte_of(qh_u32, i); - - let yl0 = x_block[i]; - let yl8 = x_block[i + 4u]; - let yh0 = x_block[i + 8u]; - let yh8 = x_block[i + 12u]; - - sumy[0] += yl0; - sumy[1] += yl8; - sumy[2] += yh0; - sumy[3] += yh8; - - let q0 = f32((q1b & 0x0Fu) | select(0u, 0x10u, (qhb & hm1) != 0u)); - let q1 = f32((q1b >> 4u) | select(0u, 0x10u, (qhb & hm2) != 0u)); - let q2 = f32((q2b & 0x0Fu) | select(0u, 0x10u, (qhb & hm3) != 0u)); - let q3 = f32((q2b >> 4u) | select(0u, 0x10u, (qhb & hm4) != 0u)); - - vals[0] += yl0 * q0; - vals[1] += yl8 * q1; - vals[2] += yh0 * q2; - vals[3] += yh8 * q3; - } + for (var col = 0u;col < NUM_COLS;col += 1) { + var vals = vec4(0.0, 0.0, 0.0, 0.0); + var sumy = vec4(0.0, 0.0, 0.0, 0.0); + for (var i = 0u; i < 4u; i++) { + let q1b = byte_of(q1_u32, i); + let q2b = byte_of(q2_u32, i); + let qhb = byte_of(qh_u32, i); + + let yl0 = x_block[col][i]; + let yl8 = x_block[col][i + 4u]; + let yh0 = x_block[col][i + 8u]; + let yh8 = x_block[col][i + 12u]; + + sumy[0] += yl0; + sumy[1] += yl8; + sumy[2] += yh0; + sumy[3] += yh8; + + let q0 = f32((q1b & 0x0Fu) | select(0u, 0x10u, (qhb & hm1) != 0u)); + let q1 = f32((q1b >> 4u) | select(0u, 0x10u, (qhb & hm2) != 0u)); + let q2 = f32((q2b & 0x0Fu) | select(0u, 0x10u, (qhb & hm3) != 0u)); + let q3 = f32((q2b >> 4u) | select(0u, 0x10u, (qhb & hm4) != 0u)); + + vals[0] += yl0 * q0; + vals[1] += yl8 * q1; + vals[2] += yh0 * q2; + vals[3] += yh8 * q3; + } - acc[row] += d * (f0 * vals[0] + f1 * vals[1] + f4 * vals[2] + f5 * vals[3]) - - dmin * (sumy[0] * m0 + sumy[1] * m1 + - sumy[2] * m4 + sumy[3] * m5); + acc[col][row] += d * (f0 * vals[0] + f1 * vals[1] + f4 * vals[2] + f5 * vals[3]) + - dmin * (sumy[0] * m0 + sumy[1] * m1 + + sumy[2] * m4 + sumy[3] * m5); + } } } } @@ -755,8 +797,8 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE 256 #define BLOCK_SIZE_BYTES 210 #define THREADS_PER_BLOCK 16 -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let tid = thread_id % THREADS_PER_BLOCK; let block_group = thread_id / THREADS_PER_BLOCK; @@ -777,14 +819,16 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src for (var block = block_group; block < num_blocks; block += num_block_groups) { let x_base = src1_idx_base + block * BLOCK_SIZE + y_offset; - var x_block: array; - for (var l = 0u; l < 4u; l++) { - x_block[l] = f32(src1[x_base + l]); - x_block[l + 4u] = f32(src1[x_base + 32u + l]); - x_block[l + 8u] = f32(src1[x_base + 64u + l]); - x_block[l + 12u] = f32(src1[x_base + 96u + l]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + let col_base = x_base + col * params.stride_11; + for (var l = 0u; l < 4u; l++) { + x_block[col][l] = f32(src1[col_base + l]); + x_block[col][l + 4u] = f32(src1[col_base + 32u + l]); + x_block[col][l + 8u] = f32(src1[col_base + 64u + l]); + x_block[col][l + 12u] = f32(src1[col_base + 96u + l]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { @@ -802,26 +846,28 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let sc4 = sbyte_of(sc_u32_1, sc_byte_pos); let sc6 = sbyte_of(sc_u32_1, sc_byte_pos + 2u); - var sums = vec4(0.0, 0.0, 0.0, 0.0); + for (var col = 0u;col < NUM_COLS;col += 1) { + var sums = vec4(0.0, 0.0, 0.0, 0.0); - for (var l = 0u; l < 4u; l++) { - let q1b = byte_of(ql1_u32, l); - let q2b = byte_of(ql2_u32, l); - let qhb = byte_of(qh_u32, l); + for (var l = 0u; l < 4u; l++) { + let q1b = byte_of(ql1_u32, l); + let q2b = byte_of(ql2_u32, l); + let qhb = byte_of(qh_u32, l); - let dq0 = f32(i32((q1b & 0x0Fu) | ((qhb & 0x03u) << 4u)) - 32); - let dq1 = f32(i32((q2b & 0x0Fu) | ((qhb & 0x0Cu) << 2u)) - 32); - let dq2 = f32(i32((q1b >> 4u) | (qhb & 0x30u)) - 32); - let dq3 = f32(i32((q2b >> 4u) | ((qhb & 0xC0u) >> 2u)) - 32); + let dq0 = f32(i32((q1b & 0x0Fu) | ((qhb & 0x03u) << 4u)) - 32); + let dq1 = f32(i32((q2b & 0x0Fu) | ((qhb & 0x0Cu) << 2u)) - 32); + let dq2 = f32(i32((q1b >> 4u) | (qhb & 0x30u)) - 32); + let dq3 = f32(i32((q2b >> 4u) | ((qhb & 0xC0u) >> 2u)) - 32); - sums[0] += x_block[l] * dq0; - sums[1] += x_block[l + 4u] * dq1; - sums[2] += x_block[l + 8u] * dq2; - sums[3] += x_block[l + 12u] * dq3; - } + sums[0] += x_block[col][l] * dq0; + sums[1] += x_block[col][l + 4u] * dq1; + sums[2] += x_block[col][l + 8u] * dq2; + sums[3] += x_block[col][l + 12u] * dq3; + } - acc[row] += d * (sums[0] * f32(sc0) + sums[1] * f32(sc2) + - sums[2] * f32(sc4) + sums[3] * f32(sc6)); + acc[col][row] += d * (sums[0] * f32(sc0) + sums[1] * f32(sc2) + + sums[2] * f32(sc4) + sums[3] * f32(sc6)); + } } } } @@ -834,8 +880,8 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE 256 #define BLOCK_SIZE_BYTES 50 #define THREADS_PER_BLOCK 16 -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let tid = thread_id % THREADS_PER_BLOCK; let block_group = thread_id / THREADS_PER_BLOCK; @@ -850,11 +896,12 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src for (var block = block_group; block < num_blocks; block += num_block_groups) { let x_base = src1_idx_base + block * BLOCK_SIZE + y_offset; - var x_block: array; - for (var i = 0u; i < 16u; i++) { - x_block[i] = f32(src1[x_base + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < 16u; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { @@ -866,20 +913,22 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x8000u) != 0u); let qs_w = load_u32_at_src0(block_byte_base + 2u + sub_blk * 4u); - var row_sum = 0.0; - for (var ll = 0u; ll < 2u; ll++) { - let l = slot0 + ll; - let qs_byte = get_byte(qs_w, l); - let ig = (qs_byte | (((qh >> (3u * l)) & 7u) << 8u)) * 8u; - let gw = iq1_grid[ig / 16u]; - let bit_base = (ig % 16u) * 2u; - for (var j = 0u; j < 8u; j++) { - let g = (gw >> (bit_base + j * 2u)) & 3u; - let gs = select(f32(g), f32(g) - 4.0, (g & 2u) != 0u); - row_sum += dl * (gs + delta) * x_block[ll * 8u + j]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var ll = 0u; ll < 2u; ll++) { + let l = slot0 + ll; + let qs_byte = get_byte(qs_w, l); + let ig = (qs_byte | (((qh >> (3u * l)) & 7u) << 8u)) * 8u; + let gw = iq1_grid[ig / 16u]; + let bit_base = (ig % 16u) * 2u; + for (var j = 0u; j < 8u; j++) { + let g = (gw >> (bit_base + j * 2u)) & 3u; + let gs = select(f32(g), f32(g) - 4.0, (g & 2u) != 0u); + row_sum += dl * (gs + delta) * x_block[col][ll * 8u + j]; + } } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -892,8 +941,8 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE 256 #define BLOCK_SIZE_BYTES 56 #define THREADS_PER_BLOCK 16 -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let tid = thread_id % THREADS_PER_BLOCK; let block_group = thread_id / THREADS_PER_BLOCK; @@ -908,11 +957,12 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src for (var block = block_group; block < num_blocks; block += num_block_groups) { let x_base = src1_idx_base + block * BLOCK_SIZE + y_offset; - var x_block: array; - for (var i = 0u; i < 16u; i++) { - x_block[i] = f32(src1[x_base + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < 16u; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { @@ -936,26 +986,28 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let qh_lo = qh & 0xFFu; let qh_hi = (qh >> 8u) & 0xFFu; - var row_sum = 0.0; - for (var ll = 0u; ll < 2u; ll++) { - let l = slot0 + ll; - let bit_off = 6u * (sub_blk % 2u) + 3u * (l / 2u); - let sub_scale = (sc_u16 >> bit_off) & 0x7u; - let dl = d * f32(2u * sub_scale + 1u); - let qh_byte = select(qh_lo, qh_hi, l >= 2u); - let ll2 = l % 2u; - let grid_idx = get_byte(qs_w, l) | (((qh_byte >> (4u * ll2)) & 7u) << 8u); - let delta = select(IQ1_DELTA, -IQ1_DELTA, ((qh_byte >> (3u + 4u * ll2)) & 1u) != 0u); - let ig = grid_idx * 8u; - let gw = iq1_grid[ig / 16u]; - let bit_base = (ig % 16u) * 2u; - for (var j = 0u; j < 8u; j++) { - let g = (gw >> (bit_base + j * 2u)) & 3u; - let gs = select(f32(g), f32(g) - 4.0, (g & 2u) != 0u); - row_sum += dl * (gs + delta) * x_block[ll * 8u + j]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var ll = 0u; ll < 2u; ll++) { + let l = slot0 + ll; + let bit_off = 6u * (sub_blk % 2u) + 3u * (l / 2u); + let sub_scale = (sc_u16 >> bit_off) & 0x7u; + let dl = d * f32(2u * sub_scale + 1u); + let qh_byte = select(qh_lo, qh_hi, l >= 2u); + let ll2 = l % 2u; + let grid_idx = get_byte(qs_w, l) | (((qh_byte >> (4u * ll2)) & 7u) << 8u); + let delta = select(IQ1_DELTA, -IQ1_DELTA, ((qh_byte >> (3u + 4u * ll2)) & 1u) != 0u); + let ig = grid_idx * 8u; + let gw = iq1_grid[ig / 16u]; + let bit_base = (ig % 16u) * 2u; + for (var j = 0u; j < 8u; j++) { + let g = (gw >> (bit_base + j * 2u)) & 3u; + let gs = select(f32(g), f32(g) - 4.0, (g & 2u) != 0u); + row_sum += dl * (gs + delta) * x_block[col][ll * 8u + j]; + } } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -968,8 +1020,8 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE 256 #define BLOCK_SIZE_BYTES 66 #define THREADS_PER_BLOCK 16 -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let tid = thread_id % THREADS_PER_BLOCK; let block_group = thread_id / THREADS_PER_BLOCK; @@ -984,11 +1036,12 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src for (var block = block_group; block < num_blocks; block += num_block_groups) { let x_base = src1_idx_base + block * BLOCK_SIZE + y_offset; - var x_block: array; - for (var i = 0u; i < 16u; i++) { - x_block[i] = f32(src1[x_base + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < 16u; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { @@ -999,22 +1052,24 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let ls = aux_hi >> 28u; let db = d * (0.5 + f32(ls)) * 0.25; - var row_sum = 0.0; - for (var ll = 0u; ll < 2u; ll++) { - let l = slot0 + ll; - let grid_idx = (aux_lo >> (8u * l)) & 0xFFu; - let signs_idx = (aux_hi >> (7u * l)) & 0x7Fu; - let signs = (ksigns_iq2xs[signs_idx / 4u] >> ((signs_idx % 4u) * 8u)) & 0xFFu; - let gw_lo = iq2xxs_grid[grid_idx * 2u]; - let gw_hi = iq2xxs_grid[grid_idx * 2u + 1u]; - for (var j = 0u; j < 8u; j++) { - let gw = select(gw_hi, gw_lo, j < 4u); - let b = f32((gw >> ((j & 3u) * 8u)) & 0xFFu); - let s = select(1.0, -1.0, ((signs >> j) & 1u) != 0u); - row_sum += db * b * s * x_block[ll * 8u + j]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var ll = 0u; ll < 2u; ll++) { + let l = slot0 + ll; + let grid_idx = (aux_lo >> (8u * l)) & 0xFFu; + let signs_idx = (aux_hi >> (7u * l)) & 0x7Fu; + let signs = (ksigns_iq2xs[signs_idx / 4u] >> ((signs_idx % 4u) * 8u)) & 0xFFu; + let gw_lo = iq2xxs_grid[grid_idx * 2u]; + let gw_hi = iq2xxs_grid[grid_idx * 2u + 1u]; + for (var j = 0u; j < 8u; j++) { + let gw = select(gw_hi, gw_lo, j < 4u); + let b = f32((gw >> ((j & 3u) * 8u)) & 0xFFu); + let s = select(1.0, -1.0, ((signs >> j) & 1u) != 0u); + row_sum += db * b * s * x_block[col][ll * 8u + j]; + } } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -1027,8 +1082,8 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE 256 #define BLOCK_SIZE_BYTES 74 #define THREADS_PER_BLOCK 16 -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let tid = thread_id % THREADS_PER_BLOCK; let block_group = thread_id / THREADS_PER_BLOCK; @@ -1043,11 +1098,12 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src for (var block = block_group; block < num_blocks; block += num_block_groups) { let x_base = src1_idx_base + block * BLOCK_SIZE + y_offset; - var x_block: array; - for (var i = 0u; i < 16u; i++) { - x_block[i] = f32(src1[x_base + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < 16u; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { @@ -1058,27 +1114,29 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let scales_word = load_u32_at_src0(block_byte_base + 66u + (sub_blk / 4u) * 4u); let scales_byte = get_byte(scales_word, sub_blk % 4u); - var row_sum = 0.0; - for (var ll = 0u; ll < 2u; ll++) { - let l = slot0 + ll; - let qs_word = select(qs_hi, qs_lo, l < 2u); - let half2 = (l % 2u) * 16u; - let qs_val = (qs_word >> half2) & 0xFFFFu; - let grid_idx = qs_val & 0x1FFu; - let signs_idx = (qs_val >> 9u) & 0x7Fu; - let sub_scale = (scales_byte >> (4u * (l / 2u))) & 0xFu; - let db = d * (0.5 + f32(sub_scale)) * 0.25; - let signs = (ksigns_iq2xs[signs_idx / 4u] >> ((signs_idx % 4u) * 8u)) & 0xFFu; - let gw_lo = iq2xs_grid[grid_idx * 2u]; - let gw_hi = iq2xs_grid[grid_idx * 2u + 1u]; - for (var j = 0u; j < 8u; j++) { - let gw = select(gw_hi, gw_lo, j < 4u); - let b = f32((gw >> ((j & 3u) * 8u)) & 0xFFu); - let s = select(1.0, -1.0, ((signs >> j) & 1u) != 0u); - row_sum += db * b * s * x_block[ll * 8u + j]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var ll = 0u; ll < 2u; ll++) { + let l = slot0 + ll; + let qs_word = select(qs_hi, qs_lo, l < 2u); + let half2 = (l % 2u) * 16u; + let qs_val = (qs_word >> half2) & 0xFFFFu; + let grid_idx = qs_val & 0x1FFu; + let signs_idx = (qs_val >> 9u) & 0x7Fu; + let sub_scale = (scales_byte >> (4u * (l / 2u))) & 0xFu; + let db = d * (0.5 + f32(sub_scale)) * 0.25; + let signs = (ksigns_iq2xs[signs_idx / 4u] >> ((signs_idx % 4u) * 8u)) & 0xFFu; + let gw_lo = iq2xs_grid[grid_idx * 2u]; + let gw_hi = iq2xs_grid[grid_idx * 2u + 1u]; + for (var j = 0u; j < 8u; j++) { + let gw = select(gw_hi, gw_lo, j < 4u); + let b = f32((gw >> ((j & 3u) * 8u)) & 0xFFu); + let s = select(1.0, -1.0, ((signs >> j) & 1u) != 0u); + row_sum += db * b * s * x_block[col][ll * 8u + j]; + } } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -1091,8 +1149,8 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE 256 #define BLOCK_SIZE_BYTES 82 #define THREADS_PER_BLOCK 16 -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let tid = thread_id % THREADS_PER_BLOCK; let block_group = thread_id / THREADS_PER_BLOCK; @@ -1107,11 +1165,12 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src for (var block = block_group; block < num_blocks; block += num_block_groups) { let x_base = src1_idx_base + block * BLOCK_SIZE + y_offset; - var x_block: array; - for (var i = 0u; i < 16u; i++) { - x_block[i] = f32(src1[x_base + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < 16u; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { @@ -1124,24 +1183,26 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let sc_word = load_u32_at_src0(block_byte_base + 74u + (sub_blk / 4u) * 4u); let scales_byte = get_byte(sc_word, sub_blk % 4u); - var row_sum = 0.0; - for (var ll = 0u; ll < 2u; ll++) { - let l = slot0 + ll; - let qs_byte = get_byte(qs_w, l); - let sign_byte = get_byte(sg_w, l); - let grid_idx = qs_byte | (((qh_byte >> (2u * l)) & 3u) << 8u); - let sub_scale = (scales_byte >> (4u * (l / 2u))) & 0xFu; - let db = d * (0.5 + f32(sub_scale)) * 0.25; - let gw_lo = iq2s_grid[grid_idx * 2u]; - let gw_hi = iq2s_grid[grid_idx * 2u + 1u]; - for (var j = 0u; j < 8u; j++) { - let gw = select(gw_hi, gw_lo, j < 4u); - let b = f32((gw >> ((j & 3u) * 8u)) & 0xFFu); - let s = select(1.0, -1.0, ((sign_byte >> j) & 1u) != 0u); - row_sum += db * b * s * x_block[ll * 8u + j]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var ll = 0u; ll < 2u; ll++) { + let l = slot0 + ll; + let qs_byte = get_byte(qs_w, l); + let sign_byte = get_byte(sg_w, l); + let grid_idx = qs_byte | (((qh_byte >> (2u * l)) & 3u) << 8u); + let sub_scale = (scales_byte >> (4u * (l / 2u))) & 0xFu; + let db = d * (0.5 + f32(sub_scale)) * 0.25; + let gw_lo = iq2s_grid[grid_idx * 2u]; + let gw_hi = iq2s_grid[grid_idx * 2u + 1u]; + for (var j = 0u; j < 8u; j++) { + let gw = select(gw_hi, gw_lo, j < 4u); + let b = f32((gw >> ((j & 3u) * 8u)) & 0xFFu); + let s = select(1.0, -1.0, ((sign_byte >> j) & 1u) != 0u); + row_sum += db * b * s * x_block[col][ll * 8u + j]; + } } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -1154,8 +1215,8 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE 256 #define BLOCK_SIZE_BYTES 98 #define THREADS_PER_BLOCK 16 -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let tid = thread_id % THREADS_PER_BLOCK; let block_group = thread_id / THREADS_PER_BLOCK; @@ -1170,11 +1231,12 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src for (var block = block_group; block < num_blocks; block += num_block_groups) { let x_base = src1_idx_base + block * BLOCK_SIZE + y_offset; - var x_block: array; - for (var i = 0u; i < 16u; i++) { - x_block[i] = f32(src1[x_base + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < 16u; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { @@ -1186,27 +1248,29 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let ls = aux >> 28u; let db = d * (0.5 + f32(ls)) * 0.5; - var row_sum = 0.0; - for (var ll = 0u; ll < 2u; ll++) { - let l = slot0 + ll; - let qs_word = select(qs_hi, qs_lo, l < 2u); - let byte_pos = (l % 2u) * 2u; - let grid_idx_0 = (qs_word >> (byte_pos * 8u)) & 0xFFu; - let grid_idx_1 = (qs_word >> ((byte_pos + 1u) * 8u)) & 0xFFu; - let signs_idx = (aux >> (7u * l)) & 0x7Fu; - let signs = (ksigns_iq2xs[signs_idx / 4u] >> ((signs_idx % 4u) * 8u)) & 0xFFu; - let grid1 = iq3xxs_grid[grid_idx_0]; - let grid2 = iq3xxs_grid[grid_idx_1]; - for (var j = 0u; j < 4u; j++) { - let b1 = f32((grid1 >> (j * 8u)) & 0xFFu); - let b2 = f32((grid2 >> (j * 8u)) & 0xFFu); - let s1 = select(1.0, -1.0, ((signs >> j) & 1u) != 0u); - let s2 = select(1.0, -1.0, ((signs >> (j + 4u)) & 1u) != 0u); - row_sum += db * b1 * s1 * x_block[ll * 8u + j]; - row_sum += db * b2 * s2 * x_block[ll * 8u + j + 4u]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var ll = 0u; ll < 2u; ll++) { + let l = slot0 + ll; + let qs_word = select(qs_hi, qs_lo, l < 2u); + let byte_pos = (l % 2u) * 2u; + let grid_idx_0 = (qs_word >> (byte_pos * 8u)) & 0xFFu; + let grid_idx_1 = (qs_word >> ((byte_pos + 1u) * 8u)) & 0xFFu; + let signs_idx = (aux >> (7u * l)) & 0x7Fu; + let signs = (ksigns_iq2xs[signs_idx / 4u] >> ((signs_idx % 4u) * 8u)) & 0xFFu; + let grid1 = iq3xxs_grid[grid_idx_0]; + let grid2 = iq3xxs_grid[grid_idx_1]; + for (var j = 0u; j < 4u; j++) { + let b1 = f32((grid1 >> (j * 8u)) & 0xFFu); + let b2 = f32((grid2 >> (j * 8u)) & 0xFFu); + let s1 = select(1.0, -1.0, ((signs >> j) & 1u) != 0u); + let s2 = select(1.0, -1.0, ((signs >> (j + 4u)) & 1u) != 0u); + row_sum += db * b1 * s1 * x_block[col][ll * 8u + j]; + row_sum += db * b2 * s2 * x_block[col][ll * 8u + j + 4u]; + } } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -1219,8 +1283,8 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE 256 #define BLOCK_SIZE_BYTES 110 #define THREADS_PER_BLOCK 16 -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let tid = thread_id % THREADS_PER_BLOCK; let block_group = thread_id / THREADS_PER_BLOCK; @@ -1235,11 +1299,12 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src for (var block = block_group; block < num_blocks; block += num_block_groups) { let x_base = src1_idx_base + block * BLOCK_SIZE + y_offset; - var x_block: array; - for (var i = 0u; i < 16u; i++) { - x_block[i] = f32(src1[x_base + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < 16u; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { @@ -1255,28 +1320,30 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let sub_scale = (scales_byte >> (4u * (sub_blk % 2u))) & 0xFu; let db = d * (1.0 + 2.0 * f32(sub_scale)); - var row_sum = 0.0; - for (var ll = 0u; ll < 2u; ll++) { - let l = slot0 + ll; - let qs_word = select(qs_hi, qs_lo, l < 2u); - let byte_pos = (l % 2u) * 2u; - let qs0 = (qs_word >> (byte_pos * 8u)) & 0xFFu; - let qs1 = (qs_word >> ((byte_pos + 1u) * 8u)) & 0xFFu; - let grid_idx_1 = qs0 | (((qh_byte >> (2u * l)) & 1u) << 8u); - let grid_idx_2 = qs1 | (((qh_byte >> (2u * l + 1u)) & 1u) << 8u); - let sign_byte = get_byte(sg_w, l); - let grid1 = iq3s_grid[grid_idx_1]; - let grid2 = iq3s_grid[grid_idx_2]; - for (var j = 0u; j < 4u; j++) { - let b1 = f32((grid1 >> (j * 8u)) & 0xFFu); - let b2 = f32((grid2 >> (j * 8u)) & 0xFFu); - let s1 = select(1.0, -1.0, ((sign_byte >> j) & 1u) != 0u); - let s2 = select(1.0, -1.0, ((sign_byte >> (j + 4u)) & 1u) != 0u); - row_sum += db * b1 * s1 * x_block[ll * 8u + j]; - row_sum += db * b2 * s2 * x_block[ll * 8u + j + 4u]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var ll = 0u; ll < 2u; ll++) { + let l = slot0 + ll; + let qs_word = select(qs_hi, qs_lo, l < 2u); + let byte_pos = (l % 2u) * 2u; + let qs0 = (qs_word >> (byte_pos * 8u)) & 0xFFu; + let qs1 = (qs_word >> ((byte_pos + 1u) * 8u)) & 0xFFu; + let grid_idx_1 = qs0 | (((qh_byte >> (2u * l)) & 1u) << 8u); + let grid_idx_2 = qs1 | (((qh_byte >> (2u * l + 1u)) & 1u) << 8u); + let sign_byte = get_byte(sg_w, l); + let grid1 = iq3s_grid[grid_idx_1]; + let grid2 = iq3s_grid[grid_idx_2]; + for (var j = 0u; j < 4u; j++) { + let b1 = f32((grid1 >> (j * 8u)) & 0xFFu); + let b2 = f32((grid2 >> (j * 8u)) & 0xFFu); + let s1 = select(1.0, -1.0, ((sign_byte >> j) & 1u) != 0u); + let s2 = select(1.0, -1.0, ((sign_byte >> (j + 4u)) & 1u) != 0u); + row_sum += db * b1 * s1 * x_block[col][ll * 8u + j]; + row_sum += db * b2 * s2 * x_block[col][ll * 8u + j + 4u]; + } } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -1290,35 +1357,37 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE_BYTES 18 #define THREADS_PER_BLOCK 4 #define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK) -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let num_blocks = params.k / BLOCK_SIZE; let thread_within_block = thread_id % THREADS_PER_BLOCK; for (var block = thread_id / THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE / THREADS_PER_BLOCK) { let x_base = src1_idx_base + block * BLOCK_SIZE + thread_within_block * 4u; - var x_block: array; - for (var i = 0u; i < ELEMS_PER_THREAD / 2u; i++) { - x_block[i] = f32(src1[x_base + i]); - x_block[i + 4u] = f32(src1[x_base + i + 16u]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < ELEMS_PER_THREAD / 2u; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + x_block[col][i + 4u] = f32(src1[x_base + col * params.stride_11 + i + 16u]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES; let d = f32(load_f16_at_src0(block_byte_base)); - var row_sum = 0.0; - let q_packed = load_u32_at_src0(block_byte_base + 2u + 4u * thread_within_block); - for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { - let q_byte = get_byte(q_packed, byte_idx); - let q_lo = f32(kvalues_iq4nl[q_byte & 0xFu]) * d; - let q_hi = f32(kvalues_iq4nl[(q_byte >> 4u) & 0xFu]) * d; - row_sum += q_lo * x_block[byte_idx]; - row_sum += q_hi * x_block[byte_idx + 4u]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { + let q_byte = get_byte(q_packed, byte_idx); + let q_lo = f32(kvalues_iq4nl[q_byte & 0xFu]) * d; + let q_hi = f32(kvalues_iq4nl[(q_byte >> 4u) & 0xFu]) * d; + row_sum += q_lo * x_block[col][byte_idx]; + row_sum += q_hi * x_block[col][byte_idx + 4u]; + } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -1331,8 +1400,8 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE 256 #define BLOCK_SIZE_BYTES 136 #define THREADS_PER_BLOCK 16 -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let tid = thread_id % THREADS_PER_BLOCK; let block_group = thread_id / THREADS_PER_BLOCK; @@ -1346,11 +1415,12 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src for (var block = block_group; block < num_blocks; block += num_block_groups) { let x_base = src1_idx_base + block * BLOCK_SIZE + y_offset; - var x_block: array; - for (var i = 0u; i < 16u; i++) { - x_block[i] = f32(src1[x_base + i]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < 16u; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { @@ -1370,17 +1440,19 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src let q_w2 = load_u32_at_src0(block_byte_base + qs_byte_off + 8u); let q_w3 = load_u32_at_src0(block_byte_base + qs_byte_off + 12u); - var row_sum = 0.0; - for (var i = 0u; i < 16u; i++) { - let q_word = select( - select(q_w0, q_w1, i >= 4u), - select(q_w2, q_w3, i >= 12u), - i >= 8u); - let q_byte = get_byte(q_word, i % 4u); - let nib = select(q_byte & 0xFu, (q_byte >> 4u) & 0xFu, half == 1u); - row_sum += f32(kvalues_iq4nl[nib]) * dl * x_block[i]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var i = 0u; i < 16u; i++) { + let q_word = select( + select(q_w0, q_w1, i >= 4u), + select(q_w2, q_w3, i >= 12u), + i >= 8u); + let q_byte = get_byte(q_word, i % 4u); + let nib = select(q_byte & 0xFu, (q_byte >> 4u) & 0xFu, half == 1u); + row_sum += f32(kvalues_iq4nl[nib]) * dl * x_block[col][i]; + } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } @@ -1394,35 +1466,38 @@ fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src #define BLOCK_SIZE_BYTES 17 #define THREADS_PER_BLOCK 4 #define ELEMS_PER_THREAD (BLOCK_SIZE/THREADS_PER_BLOCK) -fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let num_blocks = params.k / BLOCK_SIZE; let thread_within_block = thread_id % 4; for (var block = thread_id/THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE/THREADS_PER_BLOCK) { let x_base = src1_idx_base + block * BLOCK_SIZE + thread_within_block * 4; - var x_block: array; - for (var i = 0u; i < ELEMS_PER_THREAD / 2; i++) { - x_block[i] = f32(src1[x_base + i]); - x_block[i + 4] = f32(src1[x_base + i + 16]); + var x_block: array, NUM_COLS>; + for (var col = 0u; col < NUM_COLS;col += 1) { + for (var i = 0u; i < ELEMS_PER_THREAD / 2; i++) { + x_block[col][i] = f32(src1[x_base + col * params.stride_11 + i]); + x_block[col][i + 4] = f32(src1[x_base + col * params.stride_11 + i + 16]); + } } - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES; let eu8 = get_byte(load_u32_at_src0(block_byte_base), 0); let e = ldexp(1.0, i32(eu8) - 128); - var row_sum = 0.0; let q_packed = load_u32_at_src0(block_byte_base + 1u + 4u * thread_within_block); - for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { - let q_byte = get_byte(q_packed, byte_idx); - let q_lo = f32(kvalues_mxfp4[q_byte & 0xFu]) * e; - let q_hi = f32(kvalues_mxfp4[(q_byte >> 4u) & 0xFu]) * e; - row_sum += q_lo * x_block[byte_idx]; - row_sum += q_hi * x_block[byte_idx + 4u]; + for (var col = 0u;col < NUM_COLS;col += 1) { + var row_sum = 0.0; + for (var byte_idx = 0u; byte_idx < 4u; byte_idx++) { + let q_byte = get_byte(q_packed, byte_idx); + let q_lo = f32(kvalues_mxfp4[q_byte & 0xFu]) * e; + let q_hi = f32(kvalues_mxfp4[(q_byte >> 4u) & 0xFu]) * e; + row_sum += q_lo * x_block[col][byte_idx]; + row_sum += q_hi * x_block[col][byte_idx + 4u]; + } + acc[col][row] += row_sum; } - acc[row] += row_sum; } } } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl index 3ef2f77ebe0f..6ccaf61a6a07 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat_vec_q_acc.tmpl @@ -51,10 +51,7 @@ fn repack_b_dm(block: u32) -> B_DS_TYPE { fn get_dm(block_byte_base: u32) -> f32 { return f32(load_f16_at_src0(block_byte_base)); } -fn mul_q8_1(row_sum: i32, da: f32, b_ds: B_DS_TYPE) -> f32 { - return f32(row_sum) * (da * b_ds.x) - 8.0 * da * b_ds.y / THREADS_PER_BLOCK; -} -#endif +#endif // MUL_ACC_Q4_0 #ifdef MUL_ACC_Q4_1 #define BLOCK_SIZE_BYTES 20 @@ -85,10 +82,7 @@ fn get_dm(block_byte_base: u32) -> vec2 { f32(load_f16_at_src0(block_byte_base + 2u)) ); } -fn mul_q8_1(row_sum: i32, dma: vec2, b_ds: B_DS_TYPE) -> f32 { - return f32(row_sum) * (dma.x * b_ds.x) + dma.y * b_ds.y / THREADS_PER_BLOCK; -} -#endif +#endif // MUL_ACC_Q4_1 #ifdef MUL_ACC_Q8_0 #define BLOCK_SIZE_BYTES 34 @@ -111,46 +105,48 @@ fn repack_b_dm(block: u32) -> B_DS_TYPE { fn get_dm(block_byte_base: u32) -> f32 { return f32(load_f16_at_src0(block_byte_base)); } -fn mul_q8_1(row_sum: i32, da: f32, b_ds: B_DS_TYPE) -> f32 { - return f32(row_sum) * (da * b_ds); -} -#endif +#endif // MUL_ACC_Q8_0 -#ifdef LEGACY_QUANTS -fn mmvq_dot_product(a_byte_base: u32, b_inner_id: u32, b_repacked: vec2, b_ds: B_DS_TYPE) -> f32 { - var row_sum = 0; - let a_repacked = repack_a(a_byte_base, b_inner_id); - - row_sum += dot4I8Packed(a_repacked[0], b_repacked[0]); - row_sum += dot4I8Packed(a_repacked[1], b_repacked[1]); - - return mul_q8_1(row_sum, get_dm(a_byte_base), b_ds); -} - -fn accumulate_vec_q_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1q_idx_base: u32) -> array { - var acc: array; +#if defined(LEGACY_QUANTS) +fn accumulate_vec_q_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1q_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let num_blocks = params.k / BLOCK_SIZE; for (var block = thread_id / THREADS_PER_BLOCK; block < num_blocks; block += WG_SIZE / THREADS_PER_BLOCK) { - let b_inner_id = thread_id % THREADS_PER_BLOCK; - let b_block_idx = src1q_idx_base + block; - - let b_repacked = repack_b_qs(b_block_idx, b_inner_id); - let b_ds = repack_b_dm(b_block_idx); - + let inner_id = thread_id % THREADS_PER_BLOCK; for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES; - acc[row] += mmvq_dot_product(block_byte_base, b_inner_id, b_repacked, b_ds); + let a_repacked = repack_a(block_byte_base, inner_id); + let da = get_dm(block_byte_base); + for (var col = 0u;col < NUM_COLS;col += 1) { + let src1q_idx = src1q_idx_base + col * (params.k / Q8_BLOCK_SIZE) + block; + let b_repacked = repack_b_qs(src1q_idx, inner_id); + let b_ds = repack_b_dm(src1q_idx); + + let row_sum = dot4I8Packed(a_repacked[0], b_repacked[0]) + dot4I8Packed(a_repacked[1], b_repacked[1]); + +#if defined(MUL_ACC_Q4_0) + acc[col][row] += f32(row_sum) * (da * b_ds.x) - 8.0 * da * b_ds.y / THREADS_PER_BLOCK; +#endif // MUL_ACC_Q4_0 + +#if defined(MUL_ACC_Q4_1) + acc[col][row] += f32(row_sum) * (da.x * b_ds.x) + da.y * b_ds.y / THREADS_PER_BLOCK; +#endif // MUL_ACC_Q4_1 + +#if defined(MUL_ACC_Q8_0) + acc[col][row] += f32(row_sum) * (da * b_ds); +#endif // MUL_ACC_Q8_0 + } } } } return acc; } -#endif +#endif // LEGACY_QUANTS #ifdef MUL_ACC_Q2_K #define BLOCK_SIZE_BYTES 84 @@ -191,22 +187,7 @@ fn get_scale_min(block_byte_base: u32, tid: u32) -> vec2 { let scale = byte_of(load_u32_at_src0_aligned(scale_byte), scale_byte & 3u); return vec2(f32(scale & 0xFu), f32(scale >> 4u)); } -fn mmvq_dot_product(a_byte_base: u32, tid: u32, b_repacked: vec4, b_ds: B_DS_TYPE) -> f32 { - let a_repacked = repack_a(a_byte_base, tid); - let dm = get_dm(a_byte_base); - let scale_min = get_scale_min(a_byte_base, tid); - - let scale_q = i32(scale_min.x); - let scale_m_i8x4 = u32(scale_min.y) * 0x01010101u; - - let row_sum_d = (dot4I8Packed(b_repacked[0], a_repacked[0]) + dot4I8Packed(b_repacked[1], a_repacked[1]) - + dot4I8Packed(b_repacked[2], a_repacked[2]) + dot4I8Packed(b_repacked[3], a_repacked[3])) * scale_q; - let row_sum_m = dot4I8Packed(b_repacked[0], scale_m_i8x4) + dot4I8Packed(b_repacked[1], scale_m_i8x4) - + dot4I8Packed(b_repacked[2], scale_m_i8x4) + dot4I8Packed(b_repacked[3], scale_m_i8x4); - - return b_ds * (dm.x * f32(row_sum_d) - dm.y * f32(row_sum_m)); -} -#endif +#endif // MUL_ACC_Q2_K #ifdef MUL_ACC_Q4_K #define BLOCK_SIZE_BYTES 144 @@ -265,39 +246,52 @@ fn get_scale_min(block_byte_base: u32, tid: u32) -> vec2 { return vec2(scale, min_val); } -fn mmvq_dot_product(a_byte_base: u32, tid: u32, b_repacked: vec4, b_ds: B_DS_TYPE) -> f32 { - let a_repacked = repack_a(a_byte_base, tid); - let dm = get_dm(a_byte_base); - let scale_min = get_scale_min(a_byte_base, tid); - - let row_sum = dot4I8Packed(a_repacked[0], b_repacked[0]) + dot4I8Packed(a_repacked[1], b_repacked[1]) - + dot4I8Packed(a_repacked[2], b_repacked[2]) + dot4I8Packed(a_repacked[3], b_repacked[3]); - - // Each thread covers half of the Q8_1 block, so add only b_ds.y/2. - return b_ds.x * dm.x * scale_min.x * f32(row_sum) - dm.y * scale_min.y * (b_ds.y / (Q8_BLOCK_SIZE / ELEMS_PER_THREAD)); -} -#endif +#endif // MUL_ACC_Q4_K #ifdef K_QUANTS -fn accumulate_vec_q_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1q_idx_base: u32) -> array { - var acc: array; +fn accumulate_vec_q_dot(thread_id: u32, row_base: u32, src0_batch_offset: u32, src1q_idx_base: u32) -> array, NUM_COLS> { + var acc: array, NUM_COLS>; let tid = thread_id % THREADS_PER_BLOCK; for (var block = thread_id / THREADS_PER_BLOCK; block < params.k / BLOCK_SIZE; block += WG_SIZE / THREADS_PER_BLOCK) { - let src1q_idx = src1q_idx_base + (block * BLOCK_SIZE + ELEMS_PER_THREAD * tid) / Q8_BLOCK_SIZE; - let b_repacked = repack_b_qs(src1q_idx, tid); - let b_ds = repack_b_dm(src1q_idx); - for (var row = 0u; row < OUTPUTS_PER_WG; row++) { let output_row = row_base + row; if (output_row < params.m) { let block_byte_base = (src0_batch_offset + output_row * params.stride_01 + block) * BLOCK_SIZE_BYTES; - acc[row] += mmvq_dot_product(block_byte_base, tid, b_repacked, b_ds); + let a_repacked = repack_a(block_byte_base, tid); + let dm = get_dm(block_byte_base); + let scale_min = get_scale_min(block_byte_base, tid); + for (var col = 0u;col < NUM_COLS;col += 1) { + let src1q_idx = src1q_idx_base + col * (params.k / Q8_BLOCK_SIZE) + (block * BLOCK_SIZE + ELEMS_PER_THREAD * tid) / Q8_BLOCK_SIZE; + let b_repacked = repack_b_qs(src1q_idx, tid); + let b_ds = repack_b_dm(src1q_idx); + +#if defined(MUL_ACC_Q2_K) + let scale_q = i32(scale_min.x); + let scale_m_i8x4 = u32(scale_min.y) * 0x01010101u; + + let row_sum_d = (dot4I8Packed(b_repacked[0], a_repacked[0]) + dot4I8Packed(b_repacked[1], a_repacked[1]) + + dot4I8Packed(b_repacked[2], a_repacked[2]) + dot4I8Packed(b_repacked[3], a_repacked[3])) * scale_q; + let row_sum_m = dot4I8Packed(b_repacked[0], scale_m_i8x4) + dot4I8Packed(b_repacked[1], scale_m_i8x4) + + dot4I8Packed(b_repacked[2], scale_m_i8x4) + dot4I8Packed(b_repacked[3], scale_m_i8x4); + + acc[col][row] += b_ds * (dm.x * f32(row_sum_d) - dm.y * f32(row_sum_m)); +#endif // MUL_ACC_Q2_K + +#if defined(MUL_ACC_Q4_K) + let row_sum = dot4I8Packed(a_repacked[0], b_repacked[0]) + dot4I8Packed(a_repacked[1], b_repacked[1]) + + dot4I8Packed(a_repacked[2], b_repacked[2]) + dot4I8Packed(a_repacked[3], b_repacked[3]); + + // Each thread covers half of the Q8_1 block, so add only b_ds.y/2. + acc[col][row] += b_ds.x * dm.x * scale_min.x * f32(row_sum) - dm.y * scale_min.y * (b_ds.y / (Q8_BLOCK_SIZE / ELEMS_PER_THREAD)); +#endif // MUL_ACC_Q4_K + + } } } } return acc; } -#endif +#endif // K_QUANTS diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl index b3f1fa04b80a..847b27ffada8 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/quantize_q8.wgsl @@ -9,9 +9,11 @@ requires packed_4x8_integer_dot_product; struct Params { offset_src1: u32, + stride_11: u32, stride_12: u32, stride_13: u32, ne0: u32, + ne1: u32, ne2: u32, ne3: u32, }; @@ -57,25 +59,28 @@ fn main( @builtin(num_workgroups) num_wg: vec3 ) { let thread_id = local_id.x; - let num_vec4 = params.ne0 / 4u; + let ne0_vec4 = params.ne0 / 4u; - let wg_per_vec = (num_vec4 + (WG_SIZE - 1u)) / WG_SIZE; - let total_batches = wg_per_vec * params.ne2 * params.ne3; + let wg_per_vec = (ne0_vec4 + (WG_SIZE - 1u)) / WG_SIZE; + let total_batches = wg_per_vec * params.ne1 * params.ne2 * params.ne3; let wg_linear = wg_id.y * num_wg.x + wg_id.x; if (wg_linear >= total_batches) { return; } - let src13_idx = wg_linear / (params.ne2 * wg_per_vec); - let src12_idx = (wg_linear - src13_idx * (params.ne2 * wg_per_vec)) / wg_per_vec; - let src11_wg_idx = wg_linear % wg_per_vec; - let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12; + let vec_idx = wg_linear / wg_per_vec; + let src13_idx = vec_idx / (params.ne2 * params.ne1); + let vec_ne12_num = vec_idx % (params.ne2 * params.ne1); + let src12_idx = vec_ne12_num / params.ne1; + let src11_idx = vec_ne12_num % params.ne1; + let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12 + src11_idx * params.stride_11; let src1_idx_vec4_base = src1_idx_base / 4u; let blocks_per_row = params.ne0 / 32u; let blocks_per_wg = (WG_SIZE * 4u) / 32u; - let src1q_idx_base = (src13_idx * params.ne2 + src12_idx) * blocks_per_row; + let src1q_idx_base = ((src13_idx * params.ne2 + src12_idx) * params.ne1 + src11_idx) * blocks_per_row; + let src11_wg_idx = wg_linear % wg_per_vec; let src1q_idx = src1q_idx_base + src11_wg_idx * blocks_per_wg + thread_id / 8u; let qs_idx = thread_id % 8u; @@ -85,7 +90,7 @@ fn main( var thread_amax = 0.0; let src11_vec4_idx = src11_wg_idx * WG_SIZE + thread_id; - let is_valid = src11_vec4_idx < num_vec4; + let is_valid = src11_vec4_idx < ne0_vec4; #ifdef USE_SUBGROUP_REDUCTION diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index b43016c87d21..f7238242912f 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -600,18 +600,15 @@ FILE * ggml_fopen(const char * fname, const char * mode) { // convert fname (UTF-8) wchar_t * wfname = ggml_mbstowcs(fname); if (wfname) { - // convert mode (ANSI) - wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t)); - wchar_t * wmode_p = wmode; - do { - *wmode_p++ = (wchar_t)*mode; - } while (*mode++); - - // open file - file = _wfopen(wfname, wmode); + // convert mode (UTF-8) + wchar_t * wmode = ggml_mbstowcs(mode); + if (wmode) { + // open file + file = _wfopen(wfname, wmode); + GGML_FREE(wmode); + } GGML_FREE(wfname); - GGML_FREE(wmode); } return file; @@ -674,6 +671,14 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { .to_float = (ggml_to_float_t) dequantize_row_q1_0, .from_float_ref = (ggml_from_float_t) quantize_row_q1_0_ref, }, + [GGML_TYPE_Q2_0] = { + .type_name = "q2_0", + .blck_size = QK2_0, + .type_size = sizeof(block_q2_0), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q2_0, + .from_float_ref = (ggml_from_float_t) quantize_row_q2_0_ref, + }, [GGML_TYPE_Q4_0] = { .type_name = "q4_0", .blck_size = QK4_0, @@ -1410,6 +1415,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break; case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; case GGML_FTYPE_MOSTLY_Q1_0: wtype = GGML_TYPE_Q1_0; break; + case GGML_FTYPE_MOSTLY_Q2_0: wtype = GGML_TYPE_Q2_0; break; case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break; case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break; @@ -7732,6 +7738,7 @@ size_t ggml_quantize_chunk( switch (type) { case GGML_TYPE_Q1_0: result = quantize_q1_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_Q2_0: result = quantize_q2_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_0: result = quantize_q4_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q4_1: result = quantize_q4_1 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_Q5_0: result = quantize_q5_0 (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 584594097346..39bbf36384ed 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -154,6 +154,9 @@ class LLM: HIDDEN_ACT = "{arch}.hidden_activation" DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in" DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" + TARGET_LAYERS = "{arch}.target_layers" + TARGET_HIDDEN_SIZE = "{arch}.target_hidden_size" + NORM_BEFORE_RESIDUAL = "{arch}.norm_before_residual" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -272,7 +275,8 @@ class Tokenizer: CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" CHAT_TEMPLATES = "tokenizer.chat_templates" # Normalizer constants - NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase" + NORMALIZER_LOWERCASE = "tokenizer.ggml.normalizer.lowercase" + NORMALIZER_STRIP_ACCENTS = "tokenizer.ggml.normalizer.strip_accents" # FIM/Infill special tokens constants FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id" FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id" @@ -355,6 +359,7 @@ class ClipAudio: CHUNK_SIZE = "clip.audio.chunk_size" CONV_KERNEL_SIZE = "clip.audio.conv_kernel_size" MAX_POS_EMB = "clip.audio.max_pos_emb" + FEATURE_LAYERS = "clip.audio.feature_layer" # Granite Speech Plus class Attention: HEAD_COUNT = "clip.audio.attention.head_count" @@ -453,6 +458,7 @@ class MODEL_ARCH(IntEnum): XVERSE = auto() COMMAND_R = auto() COHERE2 = auto() + COHERE2MOE = auto() DBRX = auto() OLMO = auto() OLMO2 = auto() @@ -510,6 +516,7 @@ class MODEL_ARCH(IntEnum): RND1 = auto() PANGU_EMBED = auto() MISTRAL3 = auto() + EAGLE3 = auto() MISTRAL4 = auto() PADDLEOCR = auto() MIMO2 = auto() @@ -900,14 +907,17 @@ class MODEL_TENSOR(IntEnum): A_PER_DIM_K_SCALE = auto() # gemma4 A_PER_DIM_SCALE = auto() # gemma4 # nextn/mtp - NEXTN_PROJ_PRE = auto() - NEXTN_PROJ_POST = auto() - NEXTN_EH_PROJ = auto() - NEXTN_EMBED_TOKENS = auto() - NEXTN_ENORM = auto() - NEXTN_HNORM = auto() + NEXTN_PROJ_PRE = auto() + NEXTN_PROJ_POST = auto() + NEXTN_EH_PROJ = auto() + NEXTN_EMBED_TOKENS = auto() + NEXTN_ENORM = auto() + NEXTN_HNORM = auto() NEXTN_SHARED_HEAD_HEAD = auto() NEXTN_SHARED_HEAD_NORM = auto() + # eagle3 + FC = auto() # feature fusion layer + D2T = auto() # draft to target vocabulary mapping # lfm2 audio A_ENC_NORM_CONV = auto() A_ENC_LINEAR_POS = auto() @@ -1004,6 +1014,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.COMMAND_R: "command-r", MODEL_ARCH.COHERE2: "cohere2", + MODEL_ARCH.COHERE2MOE: "cohere2moe", MODEL_ARCH.DBRX: "dbrx", MODEL_ARCH.OLMO: "olmo", MODEL_ARCH.OLMO2: "olmo2", @@ -1062,6 +1073,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.RND1: "rnd1", MODEL_ARCH.PANGU_EMBED: "pangu-embedded", MODEL_ARCH.MISTRAL3: "mistral3", + MODEL_ARCH.EAGLE3: "eagle3", MODEL_ARCH.MISTRAL4: "mistral4", MODEL_ARCH.PADDLEOCR: "paddleocr", MODEL_ARCH.MIMO2: "mimo2", @@ -1094,8 +1106,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POS_EMBD: "position_embd", MODEL_TENSOR.OUTPUT_NORM: "output_norm", MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense - MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense + MODEL_TENSOR.DENSE_2_OUT: "dense_2", # embeddinggemma 2_Dense + MODEL_TENSOR.DENSE_3_OUT: "dense_3", # embeddinggemma 2_Dense MODEL_TENSOR.ROPE_FREQS: "rope_freqs", MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long", MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short", @@ -1487,6 +1499,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.NEXTN_HNORM: "blk.{bid}.nextn.hnorm", MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD: "blk.{bid}.nextn.shared_head_head", MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM: "blk.{bid}.nextn.shared_head_norm", + MODEL_TENSOR.FC: "fc", + MODEL_TENSOR.D2T: "d2t", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -2861,6 +2875,33 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.COHERE2MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_GATE_UP_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.NEXTN_EH_PROJ, + MODEL_TENSOR.NEXTN_EMBED_TOKENS, + MODEL_TENSOR.NEXTN_ENORM, + MODEL_TENSOR.NEXTN_HNORM, + MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, + MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, + ], MODEL_ARCH.DBRX: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -4027,6 +4068,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.EAGLE3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_NORM_2, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FC, + MODEL_TENSOR.D2T, + ], MODEL_ARCH.MISTRAL4: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -4356,6 +4415,7 @@ class GGMLQuantizationType(IntEnum): MXFP4 = 39 NVFP4 = 40 Q1_0 = 41 + Q2_0 = 42 class ExpertGatingFuncType(IntEnum): @@ -4410,6 +4470,7 @@ class LlamaFileType(IntEnum): MOSTLY_MXFP4_MOE = 38 # except 1d tensors MOSTLY_NVFP4 = 39 # except 1d tensors MOSTLY_Q1_0 = 40 # except 1d tensors + MOSTLY_Q2_0 = 41 # except 1d tensors GUESSED = 1024 # not specified in the model file @@ -4535,6 +4596,7 @@ class VisionProjectorType: GGMLQuantizationType.MXFP4: (32, 1 + 16), GGMLQuantizationType.NVFP4: (64, 4 + 32), GGMLQuantizationType.Q1_0: (128, 2 + 16), + GGMLQuantizationType.Q2_0: (64, 2 + 16), } diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 182c9c54a53f..a06ec88b32ca 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1124,6 +1124,9 @@ def add_suppress_tokens(self, tokens: Sequence[int]) -> None: def add_normalizer_lowercase(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.NORMALIZER_LOWERCASE, value) + def add_normalizer_strip_accents(self, value: bool) -> None: + self.add_bool(Keys.Tokenizer.NORMALIZER_STRIP_ACCENTS, value) + def add_eot_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOT_ID, id) @@ -1307,6 +1310,9 @@ def add_audio_conv_kernel_size(self, value: int) -> None: def add_audio_max_pos_emb(self, value: int) -> None: self.add_uint32(Keys.ClipAudio.MAX_POS_EMB, value) + def add_audio_feature_layers(self, layers: Sequence[int]) -> None: + self.add_array(Keys.ClipAudio.FEATURE_LAYERS, layers) + def add_audio_projector_window_size(self, value: int) -> None: self.add_uint32(Keys.ClipAudio.Projector.WINDOW_SIZE, value) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index 27d3845852dd..d93b94f2d792 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -53,6 +53,7 @@ class SpecialVocab: special_token_ids: dict[str, int] chat_template: str | Sequence[Mapping[str, str]] | None normalizer_lowercase: bool | None + normalizer_strip_accents: bool | None def __init__( self, path: str | os.PathLike[str], load_merges: bool = False, @@ -66,6 +67,7 @@ def __init__( self.merges = [] self.chat_template = None self.normalizer_lowercase = None + self.normalizer_strip_accents = None if special_token_types is not None: self.special_token_types = special_token_types else: @@ -108,6 +110,10 @@ def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: if not quiet: logger.info(f'Setting normalizer_lowercase to {self.normalizer_lowercase}') gw.add_normalizer_lowercase(self.normalizer_lowercase) + if self.normalizer_strip_accents is not None: + if not quiet: + logger.info(f'Setting normalizer_strip_accents to {self.normalizer_strip_accents}') + gw.add_normalizer_strip_accents(self.normalizer_strip_accents) def _load(self, path: Path) -> None: self._try_load_from_tokenizer_json(path) @@ -155,17 +161,21 @@ def _set_special_token(self, typ: str, tid: Any) -> None: def _parse_normalizer(self, normalizer: dict) -> None: # ref: https://huggingface.co/docs/tokenizers/api/normalizers # - # Detects lowercase normalization in three possible formats: - # 1. Standalone: {"type": "Lowercase"} - # 2. BertNormalizer attribute: {"type": "BertNormalizer", "lowercase": true, ...} - # 3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]} + # Extracts normalizer flags from three possible formats: + # 1. Standalone: {"type": "Lowercase"} + # 2. BertNormalizer attrs: {"type": "BertNormalizer", ...} + # 3. Nested in Sequence: {"type": "Sequence", "normalizers": [...]} normalizer_type = normalizer.get('type') if normalizer_type == 'Lowercase': self.normalizer_lowercase = True + elif normalizer_type == 'StripAccents': + self.normalizer_strip_accents = True elif normalizer_type == 'BertNormalizer': if 'lowercase' in normalizer: self.normalizer_lowercase = normalizer['lowercase'] + if 'strip_accents' in normalizer: + self.normalizer_strip_accents = normalizer['strip_accents'] elif normalizer_type == 'Sequence': for norm in normalizer.get('normalizers', []): self._parse_normalizer(norm) @@ -246,6 +256,11 @@ def _try_load_from_tokenizer_json(self, path: Path) -> bool: if special_first := tmpl_single[0].get('SpecialToken', {}).get('id'): if not tokenizer_config: special_bos = special_first + elif special_first not in (special_bos, special_cls): + if not special_bos: + tokenizer_config['bos_token'] = special_bos = special_first + if not special_cls: + tokenizer_config['cls_token'] = special_cls = special_first self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False if special_first not in (special_bos, special_cls): logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing') diff --git a/grammars/README.md b/grammars/README.md index 99c6bd912a77..9478b3e1b5aa 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -233,7 +233,7 @@ And a non-exhaustive list of other unsupported features that are unlikely to be > [!WARNING] > The JSON schemas spec states `object`s accept [additional properties](https://json-schema.org/understanding-json-schema/reference/object#additionalproperties) by default. > Since this is slow and seems prone to hallucinations, we default to no additional properties. -> You can set `"additionalProperties": true` in the the schema of any object to explicitly allow additional properties. +> You can set `"additionalProperties": true` in the schema of any object to explicitly allow additional properties. If you're using [Pydantic](https://pydantic.dev/) to generate schemas, you can enable additional properties with the `extra` config on each model class: diff --git a/include/llama.h b/include/llama.h index 27e480674282..5e3fc24f1409 100644 --- a/include/llama.h +++ b/include/llama.h @@ -155,6 +155,7 @@ extern "C" { LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38, // except 1d tensors LLAMA_FTYPE_MOSTLY_NVFP4 = 39, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q1_0 = 40, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_0 = 41, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; @@ -558,14 +559,15 @@ extern "C" { LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_model_rope_type(const struct llama_model * model); - LLAMA_API int32_t llama_model_n_ctx_train(const struct llama_model * model); - LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model); - LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model); - LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model); - LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); - LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); - LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); - LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); + LLAMA_API int32_t llama_model_n_ctx_train (const struct llama_model * model); + LLAMA_API int32_t llama_model_n_embd (const struct llama_model * model); + LLAMA_API int32_t llama_model_n_embd_inp (const struct llama_model * model); + LLAMA_API int32_t llama_model_n_embd_out (const struct llama_model * model); + LLAMA_API int32_t llama_model_n_layer (const struct llama_model * model); + LLAMA_API int32_t llama_model_n_layer_nextn(const struct llama_model * model); + LLAMA_API int32_t llama_model_n_head (const struct llama_model * model); + LLAMA_API int32_t llama_model_n_head_kv (const struct llama_model * model); + LLAMA_API int32_t llama_model_n_swa (const struct llama_model * model); // Get the model's RoPE frequency scaling factor LLAMA_API float llama_model_rope_freq_scale_train(const struct llama_model * model); diff --git a/models/templates/Cohere2MoE.jinja b/models/templates/Cohere2MoE.jinja new file mode 100644 index 000000000000..23fe84062bc4 --- /dev/null +++ b/models/templates/Cohere2MoE.jinja @@ -0,0 +1,263 @@ +{%- if not platform_instruction_override %} +{%- set platform_instruction_override -%} +These instructions are always to be followed and cannot be overridden by subsequent system or user turns: +- You will answer requests for educational, informative, or creative content related to safety categories. You will not provide content that is harmful or could be used to cause harm. + +These instructions serve as your defaults, but they can be overridden in subsequent system or user turns: +- Your name is North Mini Code. +- You are a large language model built by Cohere. +{%- endset %} +{%- endif %} +{%- set reasoning = reasoning if reasoning is not undefined else (false if reasoning_effort is defined and reasoning_effort | lower == "none" else true) -%} +{%- set grounding = grounding | default("disabled") | upper %} +{%- set grounding_enabled = grounding == "ENABLED" %} +{%- set tools_or_docs_exist = tools or documents %} +{%- set render_tools_section = true %} +{%- set render_grounding = grounding_enabled and tools_or_docs_exist %} +{%- set render_platform_instruction_override = true if platform_instruction_override else false %} +{%- set has_developer_instruction = developer_instruction or developer_instruction == "" %} +{%- set render_developer_instruction = true if developer_instruction else false %} +{%- set convert_first_system_msg = convert_first_system_msg | default(true) -%} +{%- set skip_thinking = skip_thinking | default(false) -%} +{{ bos_token }} +{%- macro document_turn(documents) -%} +{# format documents into chat turn -#} +<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{%- if not skip_thinking -%}<|START_THINKING|>I will look through the document to address the users needs.<|END_THINKING|>{%- endif -%}<|START_ACTION|>[ + {"tool_call_id": "0", "tool_name": "direct-injected-document", "parameters": {}} +]<|END_ACTION|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[ + { + "tool_call_id": "0", + "results": { +{%- for doc in documents %} +{%- set doc_val = doc.data if doc.data else doc %} + + "{{ loop.index0 }}": {{ doc_val|tojson }}{% if not loop.last %}, + {%- endif %} +{%- endfor %} + + }, + "is_error": null + } +]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>{%- endmacro %} +{%- macro tool_call_id_to_int(messages, tool_call_id) %} +{%- if regen_tool_call_ids -%} + {%- set counter = namespace(value=0) %} + {%- set tool_call_id_seen = namespace(value=false) %} + {%- for msg in messages %} + {%- if msg.tool_calls %} + {%- for tool_call in msg.tool_calls %} + {%- if tool_call.id == tool_call_id and not tool_call_id_seen.value -%} + {{ counter.value }} + {%- set tool_call_id_seen.value = true %} + {%- endif %} + {%- set counter.value = counter.value + 1 %} + {%- endfor %} + {%- endif %} + {%- endfor %} +{%- else -%} + {{ tool_call_id }} +{%- endif -%} +{%- endmacro %} +{%- macro format_tool_message(messages, tool_msg) -%} +{#- format tool message #}{ + "tool_call_id": "{{ tool_call_id_to_int(messages, tool_msg.tool_call_id) }}", + "results": { + {%- if tool_msg.content is mapping or tool_msg.content is string %} + + {% if tool_msg.content is string -%} + {%- set text_wrapper = {"content": tool_msg.content} -%} + {%- else -%} + {%- set text_wrapper = tool_msg.content -%} + {%- endif %} + "0": {{ text_wrapper|tojson }} + {%- else %} + {%- for content in tool_msg.content %} + + "{{ loop.index0 }}": {{ print_tool_content(content) }}{% if not loop.last %},{% endif %} + {%- endfor %} + {%- endif %} + + }, + "is_error": null + } +{%- endmacro -%} +{%- macro print_tool_content(item) %} +{%- if item.type|lower == "text" -%} +{%- set text_wrapper = {"content": item.text} -%} +{{ text_wrapper|tojson }} +{%- elif item.type|lower == "document" and item.document and "data" in item.document -%} +{{ item.document.data|tojson }} +{%- else -%} +{{ item|tojson }} +{%- endif -%} +{%- endmacro %} +{%- macro print_msg(msg) %} + {%- if msg is string -%} +<|START_TEXT|>{{ msg }}<|END_TEXT|> + {%- elif msg.content is string -%} +<|START_TEXT|>{{ msg.content }}<|END_TEXT|> + {%- else %} + {%- set last_was_text = namespace(value=false) %} + {%- for content in msg.content %} + {%- if content.type|lower == "text" -%} + {%- if not last_was_text.value -%} + <|START_TEXT|> + {%- endif -%} + {{ content.text }} + {%- if loop.last -%} + <|END_TEXT|> + {%- endif %} + {%- set last_was_text.value = true -%} + {%- else -%} + {%- if last_was_text.value -%} + <|END_TEXT|> + {%- endif -%} + {%- set last_was_text.value = false -%} + {%- endif -%} + {%- if content.type|lower == "image" -%} + {%- if content.data -%} +{{ content.data }} + {%- else -%} +<|IMG_PATCH|> + {%- endif -%} + {%- endif -%} + {%- endfor %} + {%- endif %} +{%- endmacro %} +{%- macro print_thinking(msg) %} + {%- if msg.reasoning -%} +{{ msg.reasoning }} + {%- elif msg.reasoning_content -%} +{{ msg.reasoning_content }} + {%- elif msg.thinking -%} +{{ msg.thinking }} + {%- elif msg.content and msg.content[0].thinking -%} +{{ msg.content[0].thinking }} + {%- endif %} +{%- endmacro %} +{%- if messages and messages[0]['role']|lower == 'system' and not has_developer_instruction and convert_first_system_msg %}{%- set developer_instruction = messages[0] %}{%- set render_developer_instruction = true %}{%- set initial_instruction_message = true %}{% endif %} +{%- set json_object = true if response_format and response_format.type == "json_object" else false %} +{%- set json_schema = (response_format.json_schema or response_format.schema) if response_format %} +{%- set json_mode = json_object or json_schema %} +{%- set tool_idx = namespace(value=0) %} +{%- set tool_ids_seen = namespace(value=[]) %} +{%- set regen_tool_call_ids = regen_tool_call_ids | default(true) -%} +{%- set sent_documents = namespace(value=false) -%} + +{%- if render_tools_section or render_platform_instruction_override or render_grounding or json_mode -%} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TEXT|> +{%- elif not render_developer_instruction -%} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|> +{%- endif %} + +{%- set rendered_platform_turn_chunk = false %} + +{%- if render_platform_instruction_override -%} +{{ platform_instruction_override }} +{% set rendered_platform_turn_chunk = true %} +{%- else %} +{%- endif %} + +{%- if render_grounding -%} +{%- if rendered_platform_turn_chunk %} + +{% endif -%} +Note that both your responses and reflections can be grounded. Grounding means you associate pieces of texts (called "spans") with those specific tool results that support them (called "sources"). And you use a pair of tags "" and "" to indicate when a span can be grounded onto a list of sources, listing them out in the closing tag. Sources from the same tool call are grouped together and listed as "{tool_call_id}:[{list of result indices}]", before they are joined together by ",". E.g., "span" means that "span" is supported by result 1 and 2 from "tool_call_id=0" as well as result 0 from "tool_call_id=1". +{% set rendered_platform_turn_chunk = true %} +{%- endif %} + +{%- if render_tools_section %} +{%- if rendered_platform_turn_chunk %} + +{% endif %} +# Available Tools +```json +[ +{% if tools_or_docs_exist %} +{%- if documents %} + {"name": "direct-injected-document", "description": "This is a special tool to directly inject user-uploaded documents into the chat as additional context. DO NOT use this tool by yourself!", "parameters": {"type": "object", "properties": {}, "required": []}, "responses": {"200": {"description": "Successfully returned a list of chunked text snippets from the directly uploaded documents.", "content": {"application/json": {"schema": {"type": "array", "items": {"type": "object", "required": ["url", "snippet"], "properties": {"url": {"type": "string", "description": "The url of the uploaded document."}, "snippet": {"type": "string", "description": "The text snippet for the returned document chunk."}}}}}}}}} + {%- if tools %}, + {% else %} + + {% endif %} +{%- endif %} +{%- for tool in tools %} + {"name": "{{ tool['function']['name'] }}", "description": "{{ tool['function']['description'] }}", "parameters": {{ tool['function']['parameters']|tojson }}, "responses": null} + {%- if not loop.last %},{% endif %} + +{% endfor %} +{%- else %} + +{% endif %} +] +``` +{%- set rendered_platform_turn_chunk = true %} +{%- endif -%} + +{%- if json_mode -%} +{%- if rendered_platform_turn_chunk %} + + +{% endif -%} +When generating JSON objects, do not generate block markers. Generate an object directly without prefixing with ```json. Return only the JSON and nothing else. + {%- if json_schema %} + +Your output should adhere to the following json schema: +{{ json_schema }} + {%- endif -%} +{%- set rendered_platform_turn_chunk = true %} +{%- endif %} +{%- if rendered_platform_turn_chunk -%} +<|END_TEXT|><|END_OF_TURN_TOKEN|> +{%- elif not render_developer_instruction -%} +<|END_OF_TURN_TOKEN|> +{%- endif %} +{%- if render_developer_instruction -%} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ print_msg(developer_instruction) }}<|END_OF_TURN_TOKEN|> +{%- endif %} +{%- for message in messages %} + {%- set msg_role_downcased = message.role | lower %} + {%- if msg_role_downcased == 'system' and (not (loop.first and initial_instruction_message)) -%} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ print_msg(message) }}<|END_OF_TURN_TOKEN|> + {%- elif msg_role_downcased == 'user' -%} +<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ print_msg(message) }}<|END_OF_TURN_TOKEN|> + {%- if documents and not sent_documents.value %}{%- set sent_documents.value = true %}{% set tool_idx.value = tool_idx.value + 1 %}{{ document_turn(documents) }}{% endif %} + {%- elif msg_role_downcased == 'assistant' or msg_role_downcased == 'chatbot' -%} +<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> + {%- if message.tool_calls %} + {% if not skip_thinking %} + {% if message.tool_plan -%} + <|START_THINKING|>{{ message.tool_plan }}<|END_THINKING|> + {%- elif message.reasoning or message.reasoning_content or message.thinking or (message.content and message.content[0].type == "thinking") -%} + <|START_THINKING|>{{ print_thinking(message) }}<|END_THINKING|> + {%- endif %} + {%- endif %}<|START_ACTION|>[ + {%- for tc in message.tool_calls %} + + {"tool_call_id": "{%- if regen_tool_call_ids -%}{{ tool_idx.value }}{%- else -%}{{ tc.id }}{%- endif -%}", "tool_name": "{{ tc['function']['name'] }}", "parameters": {{ tc['function']['arguments']|tojson }}}{% if not loop.last %},{% endif %} + {%- set tool_idx.value = tool_idx.value + 1 %} + {%- endfor %} + +]<|END_ACTION|><|END_OF_TURN_TOKEN|> + {%- else -%} + {% if (message.reasoning or message.reasoning_content or message.thinking or (message.content and message.content[0].type == "thinking")) and not skip_thinking -%} + <|START_THINKING|>{{ print_thinking(message) }}<|END_THINKING|> + {%- endif -%} + {{ print_msg(message) }}<|END_OF_TURN_TOKEN|> + {%- endif %} + {%- elif msg_role_downcased == 'tool' and message.tool_call_id not in tool_ids_seen.value -%} +<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[ + {{ format_tool_message(messages, message) }} + {%- for msg in messages[loop.index0 + 1:] %} + + {%- if msg.role | lower == 'tool' %}, + {{ format_tool_message(messages, msg) }} + {%- set tool_ids_seen.value = tool_ids_seen.value + [msg.tool_call_id] %} + {%- else %} + {%- break %} + {%- endif %} + {%- endfor %} + +]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|> + {%- endif %} +{%- endfor %}{%- if add_generation_prompt -%}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{% if reasoning %}<|START_THINKING|>{% else %}<|START_THINKING|><|END_THINKING|>{% endif %}{%- endif %} \ No newline at end of file diff --git a/scripts/snapdragon/adb/run-completion.sh b/scripts/snapdragon/adb/run-completion.sh index fe14bb142251..f7622eb5279e 100755 --- a/scripts/snapdragon/adb/run-completion.sh +++ b/scripts/snapdragon/adb/run-completion.sh @@ -57,19 +57,25 @@ oppoll= opflt= [ "$OF" != "" ] && opflt="GGML_HEXAGON_OPFILTER=$OF" +opfuse= +[ "$OC" != "" ] && opfuse="GGML_HEXAGON_OPFUSION=$OC" + vmem= [ "$VM" != "" ] && vmem="GGML_HEXAGON_VMEM=$VM" mbuf= [ "$MB" != "" ] && mbuf="GGML_HEXAGON_MBUF=$MB" +mmsel= +[ "$MM" != "" ] && mmsel="GGML_HEXAGON_MM_SELECT=$MM" + set -x adb $adbserial $adbhost shell " \ cd $basedir; ulimit -c unlimited; \ LD_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opflt $vmem $mbuf \ + $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opflt $opfuse $vmem $mbuf $mmsel \ ./$branch/bin/llama-completion --no-mmap -m $basedir/../gguf/$model \ --poll 1000 -t 6 --cpu-mask 0xfc --cpu-strict 1 \ --ctx-size 8192 --ubatch-size 1024 -fa on \ diff --git a/scripts/snapdragon/adb/run-tool.sh b/scripts/snapdragon/adb/run-tool.sh index 6d7e32b32187..f6332391bc91 100755 --- a/scripts/snapdragon/adb/run-tool.sh +++ b/scripts/snapdragon/adb/run-tool.sh @@ -51,6 +51,12 @@ opqueue= oppoll= [ "$OP" != "" ] && oppoll="GGML_HEXAGON_OPPOLL=$OP" +opfuse= +[ "$OC" != "" ] && opfuse="GGML_HEXAGON_OPFUSION=$OC" + +mmsel= +[ "$MM" != "" ] && mmsel="GGML_HEXAGON_MM_SELECT=$MM" + set -x tool=$1; shift @@ -59,5 +65,5 @@ adb $adbserial $adbhost shell " \ cd $basedir; ulimit -c unlimited; \ LD_LIBRARY_PATH=$basedir/$branch/lib \ ADSP_LIBRARY_PATH=$basedir/$branch/lib \ - $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll ./$branch/bin/$tool $@ \ + $verbose $sched $opmask $profile $nhvx $hmx $ndev $hb $opbatch $opqueue $oppoll $opfuse $mmsel ./$branch/bin/$tool $@ \ " diff --git a/scripts/snapdragon/ggml-hexagon-profile.py b/scripts/snapdragon/ggml-hexagon-profile.py index fe94eb6c1903..c53ad77793df 100755 --- a/scripts/snapdragon/ggml-hexagon-profile.py +++ b/scripts/snapdragon/ggml-hexagon-profile.py @@ -6,6 +6,7 @@ import argparse import statistics import logging +from typing import Any, Dict, List, Optional from collections import defaultdict @@ -25,12 +26,47 @@ } op_pattern = re.compile( - r"profile-op\s+(?P[A-Z_0-9+]+):\s+.*?\s+:\s+(?P[\d:x\s\->!]+)\s+:\s+(?P[a-z\d_\s\->x]+)\s+:\s+.*?\s+(?:op-)?usec\s+(?P\d+)\s+(?:op-)?cycles\s+(?P\d+)(?:\s+pmu\s+\[(?P[\d,\s]+)\])?" + r"profile-op\s+(?P[A-Z_0-9+]+):\s+.*?\s+:\s+(?P[\d:x\s\->!]+)\s+:\s+(?P[a-z\d_\s\->x]+)\s+:\s+.*?\s+:\s+(?:op-)?usec\s+(?P\d+)\s+(?:op-)?cycles\s+(?P\d+)(?:\s+start\s+(?P\d+))?(?:\s+mhz\s+(?P[\d.]+))?(?:\s+pmu\s+\[(?P[\d,\s]+)\])?(?:\s+evt\s+\[(?P[\d,\s]+)\])?" +) + +trace_pattern = re.compile( + r"trace-op\s+(?P[A-Z_0-9+]+):\s+thread\s+(?P\d+)\s+event\s+(?P[A-Z_0-9\-]+)\s+info\s+(?P\d+)\s+(?Pstart|stop)\s+(?P\d+)" ) logger = logging.getLogger("ggml-hexagon-profile") +def normalize_event_name(evt_type): + if evt_type == "HVX_COMP": + return "V-COMP" + if evt_type == "HMX_COMP": + return "M-COMP" + + # Strip HVX_ or HMX_ prefixes + name = evt_type + if name.startswith("HVX_") or name.startswith("HMX_"): + name = name[4:] + return name.replace("_", "-") + + +class CycleUnwrapper: + def __init__(self): + self.last_raw = None + self.high_part = 0 + + def unwrap(self, raw): + if self.last_raw is None: + self.last_raw = raw + return raw + diff = raw - self.last_raw + if diff < -0x80000000: + self.high_part += 0x100000000 + elif diff > 0x80000000: + self.high_part -= 0x100000000 + self.last_raw = raw + return raw + self.high_part + + def parse_log(file_path, pmu_index=None): try: if file_path != "-": @@ -41,35 +77,242 @@ def parse_log(file_path, pmu_index=None): logger.error(f"file '{file_path}' not found.") sys.exit(1) - all_ops = [] + all_ops: List[Dict[str, Any]] = [] + current_op: Optional[Dict[str, Any]] = None + + timestamp_pattern = re.compile(r"^(?P\d+)\.(?P\d+)\.(?P\d+)\.(?P\d+)\s+[A-Z]\s+") + unwrapper = CycleUnwrapper() + for line in f: - match = op_pattern.search(line) - if not match: continue + ts_match = timestamp_pattern.match(line) + abs_usec = 0 + if ts_match: + abs_usec = ( + (int(ts_match.group('min')) * 60 + int(ts_match.group('sec'))) * 1000000 + + int(ts_match.group('ms')) * 1000 + + int(ts_match.group('us')) + ) + + if "|" in line and "profile-op" in line: + parts = [p.strip() for p in line.split("|")] + prefix = parts[0] + prefix_match = re.search(r"profile-op\s+(?P[A-Z_0-9+]+)", prefix) + if not prefix_match: + continue + + if len(parts) == 7: + dims, types, timings = parts[2], parts[3], parts[6] + elif len(parts) == 6: + dims, types, timings = parts[2], parts[3], parts[5] + else: + continue + + timing_match = re.search( + r"(?:op-)?usec\s+(?P\d+)\s+(?:op-)?cycles\s+(?P\d+)(?:\s+start\s+(?P\d+))?(?:\s+mhz\s+(?P[\d.]+))?(?:\s+pmu\s+\[(?P[\d,\s]+)\])?(?:\s+evt\s+\[(?P[\d,\s]+)\])?", + timings + ) + if not timing_match: + continue + + op_match = timing_match + op_name = prefix_match.group("op_name") + else: + op_match = op_pattern.search(line) + if op_match: + op_name = op_match.group('op_name') + dims = op_match.group('dims').strip() + types = op_match.group('types').strip() + else: + op_match = None + + if op_match: + pmu_raw = op_match.group('pmu') if 'pmu' in op_match.groupdict() else None + pmu_val = None + if pmu_raw and pmu_index is not None: + try: + pmu_list = [int(x.strip()) for x in pmu_raw.split(',')] + if len(pmu_list) > pmu_index: + pmu_val = pmu_list[pmu_index] + except (ValueError, IndexError): + pmu_val = None + + evt_raw = op_match.group('evt') if 'evt' in op_match.groupdict() else None + evt_val = None + if evt_raw: + try: + evt_val = [int(x.strip()) for x in evt_raw.split(',')] + except ValueError: + evt_val = None + + cycles_start_raw = op_match.group('start') + unwrapped_cycles_start = None + if cycles_start_raw: + unwrapped_cycles_start = unwrapper.unwrap(int(cycles_start_raw)) + + idx = line.find("profile-op ") + op_text = line[idx + 11:].strip() if idx != -1 else line.strip() + + current_op = { + 'name': op_name, + 'dims': dims, + 'types': types, + 'op_text': op_text, + 'usec': int(op_match.group('usec')), + 'cycles': int(op_match.group('cycles')), + 'cycles_start': int(cycles_start_raw) if cycles_start_raw else None, + 'unwrapped_cycles_start': unwrapped_cycles_start, + 'pmu_val': pmu_val, + 'evt_val': evt_val, + 'abs_usec': abs_usec, + 'trace_events': [] + } + all_ops.append(current_op) + continue - pmu_raw = match.group('pmu') - pmu_val = None - if pmu_raw and pmu_index is not None: - try: - pmu_list = [int(x.strip()) for x in pmu_raw.split(',')] - if len(pmu_list) > pmu_index: - pmu_val = pmu_list[pmu_index] - except (ValueError, IndexError): - pmu_val = None - - all_ops.append({ - 'name': match.group('op_name'), - 'dims': match.group('dims').strip(), - 'types': match.group('types').strip(), - 'usec': int(match.group('usec')), - 'cycles': int(match.group('cycles')), - 'pmu_val': pmu_val - }) + trace_match = trace_pattern.search(line) + if trace_match and current_op: + if trace_match.group('op_name') == current_op['name']: + raw_cyc = int(trace_match.group('cycles')) + current_op['trace_events'].append({ + 'thread': int(trace_match.group('thread')), + 'event': trace_match.group('event'), + 'info': int(trace_match.group('info')), + 'cycles': raw_cyc, + 'unwrapped_cycles': unwrapper.unwrap(raw_cyc), + 'state': trace_match.group('state') + }) f.close() - return all_ops +def print_ascii_timeline(op_name, dims, types, usec, cycles, events, evt_val=None): + evt_str = "" + if evt_val: + evt_str = " - evt [" + ",".join(str(x) for x in evt_val) + "]" + logger.info("=" * 100) + logger.info(f"{op_name} ({dims} : {types}) - {usec} usec {cycles} cycles{evt_str}") + logger.info("=" * 100) + + events = sorted(events, key=lambda e: e['cycles']) + if not events: + logger.info(" No trace events recorded.") + return + + min_cycles = events[0]['cycles'] + + logger.info("Cycles %-30s" % "EventDetails" + " ".join(f"T{i:<2}" for i in range(10)) + " HMX") + logger.info("-" * 100) + + thread_stacks = [[] for _ in range(11)] + + for e in events: + t = e['thread'] + if t < 0 or t > 10: + continue + + if e['cycles'] >= min_cycles: + rel_cycles = e['cycles'] - min_cycles + else: + rel_cycles = (e['cycles'] + 0x100000000) - min_cycles + + state = e['state'] + evt_type = e['event'] + + # Determine char representing the event + norm_evt = normalize_event_name(evt_type) + char = '?' + if norm_evt == 'V-COMP': + char = 'V' + elif norm_evt == 'M-COMP': + char = 'H' + elif norm_evt == 'A-QUANT': + char = 'Q' + elif norm_evt == 'A-PREP': + char = 'A' + elif norm_evt == 'W-DEQUANT': + char = 'D' + elif norm_evt == 'O-PROC': + char = 'O' + elif norm_evt == 'W-PREP': + char = 'P' + elif norm_evt == 'DMA': + char = 'M' + + if state == 'start': + thread_stacks[t].append(char) + elif state == 'stop': + if thread_stacks[t]: + if thread_stacks[t][-1] == char: + thread_stacks[t].pop() + elif char in thread_stacks[t]: + thread_stacks[t].remove(char) + else: + thread_stacks[t].pop() + + cols = [] + for i in range(11): + if thread_stacks[i]: + cols.append(f"[{thread_stacks[i][-1]}]") + else: + cols.append(" | ") + + evt_desc = f"T{t}: {evt_type} {state} ({e['info']})" + logger.info(f"{rel_cycles:10d} %-30s" % evt_desc + " ".join(cols[:10]) + " " + cols[10]) + logger.info("-" * 100) + + +def print_ascii_summary(op_name, dims, types, usec, cycles, events, evt_val=None): + evt_str = "" + if evt_val: + evt_str = " - evt [" + ",".join(str(x) for x in evt_val) + "]" + logger.info("=" * 100) + logger.info(f"{op_name} ({dims} : {types}) - {usec} usec {cycles} cycles{evt_str}") + logger.info("=" * 100) + + events = sorted(events, key=lambda e: e['cycles']) + if not events: + logger.info(" No trace events recorded.") + return + + active_starts = {} + thread_totals = defaultdict(lambda: defaultdict(int)) + + for e in events: + t = e['thread'] + evt = e['event'] + info = e['info'] + cyc = e['cycles'] + state = e['state'] + + key = (t, evt, info) + if state == 'start': + active_starts[key] = cyc + elif state == 'stop': + if key in active_starts: + start_cyc = active_starts[key] + del active_starts[key] + + if cyc >= start_cyc: + dur = cyc - start_cyc + else: + dur = (cyc + 0x100000000) - start_cyc + + norm_evt = normalize_event_name(evt) + thread_totals[t][norm_evt] += dur + + for t in sorted(thread_totals.keys()): + thread_name = f"Thread {t} (HVX)" if t != 10 else "Thread 10 (HMX)" + sorted_evts = sorted(thread_totals[t].items(), key=lambda item: item[0]) + + evt_strs = [] + for evt, dur in sorted_evts: + pct = (dur / cycles * 100) if cycles > 0 else 0 + evt_strs.append(f"{evt} {dur} ({pct:.1f}%)") + + logger.info(f" {thread_name:<16}: " + " | ".join(evt_strs)) + + def generate_report(ops, top_n, width_overrides, sort_col, pmu_name=None): if not ops: logger.info("No valid records found.") @@ -115,7 +358,6 @@ def generate_report(ops, top_n, width_overrides, sort_col, pmu_name=None): # Sorting logic actual_sort_key = COL_MAP[sort_col][2] - # We sort numeric fields descending, strings (op/dims) ascending is_numeric = actual_sort_key.startswith("_") or actual_sort_key == "count" sorted_groups = sorted(group_stats, key=lambda x: x[actual_sort_key], reverse=is_numeric)[:top_n] @@ -132,7 +374,7 @@ def generate_report(ops, top_n, width_overrides, sort_col, pmu_name=None): if "pmu" in col_name and pmu_name: header_text = header_text.replace("PMU", pmu_name) - natural_width = max([len(row[data_key]) for row in sorted_groups] + [len(header_text)]) + natural_width = max([len(str(row[data_key])) for row in sorted_groups] + [len(header_text)]) target_width = width_overrides.get(col_name, natural_width) if target_width == 0: @@ -152,7 +394,7 @@ def generate_report(ops, top_n, width_overrides, sort_col, pmu_name=None): for group in sorted_groups: row_vals = [] for i, key in enumerate(final_keys): - val = group[key] + val = str(group[key]) if len(val) > final_widths[i]: val = val[:final_widths[i] - 3] + "..." row_vals.append(f"{val:<{final_widths[i]}}") @@ -167,12 +409,18 @@ def main(): parser.add_argument("--pmu-index", type=int) parser.add_argument("--pmu-name", type=str) parser.add_argument("--width", action='append', default=['dims:40'], help="Override column width, e.g. --width dims:50") + parser.add_argument("--timeline", type=str, nargs='?', const='summary', choices=["summary", "diagram"], + help="Output ASCII art event summary or timing diagram (default: summary)") + parser.add_argument("--filter", type=str, help="Regex filter matching against the original profile-op line") + + group = parser.add_mutually_exclusive_group() + group.add_argument("--head", type=int, help="Limit to first N ops") + group.add_argument("--tail", type=int, help="Limit to last N ops") args = parser.parse_args() logging.basicConfig(level=logging.INFO, format='%(message)s') - # Sort validation: can't sort by PMU if index isn't provided if "pmu" in args.sort and args.pmu_index is None: logger.error(f"Cannot sort by '{args.sort}' without --pmu-index.") sys.exit(1) @@ -188,7 +436,33 @@ def main(): final_pmu_name = (args.pmu_name or f"#{args.pmu_index}") if args.pmu_index is not None else None ops = parse_log(args.logfile, pmu_index=args.pmu_index) - generate_report(ops, args.top, overrides, args.sort, pmu_name=final_pmu_name) + + if args.filter: + try: + filter_re = re.compile(args.filter) + except re.error as e: + logger.error(f"Invalid regex filter: {e}") + sys.exit(1) + ops = [op for op in ops if filter_re.search(op['op_text'])] + + if args.head is not None: + ops = ops[:args.head] + elif args.tail is not None: + ops = ops[-args.tail:] + + if args.timeline: + logger.info(f"\n# ASCII Timing {args.timeline.capitalize()}\n") + printed_cnt = 0 + for op in ops: + if args.timeline == "summary": + print_ascii_summary(op['name'], op['dims'], op['types'], op['usec'], op['cycles'], op['trace_events'], op.get('evt_val')) + elif args.timeline == "diagram": + print_ascii_timeline(op['name'], op['dims'], op['types'], op['usec'], op['cycles'], op['trace_events'], op.get('evt_val')) + printed_cnt += 1 + if printed_cnt >= args.top: + break + else: + generate_report(ops, args.top, overrides, args.sort, pmu_name=final_pmu_name) if __name__ == "__main__": diff --git a/scripts/snapdragon/ggml-hexagon-trace.py b/scripts/snapdragon/ggml-hexagon-trace.py new file mode 100755 index 000000000000..37f137a9e758 --- /dev/null +++ b/scripts/snapdragon/ggml-hexagon-trace.py @@ -0,0 +1,499 @@ +#!/usr/bin/env python3 + +import sys +import os +import re +import argparse +import statistics +import logging +from typing import Any, Dict, List, Optional +from collections import defaultdict + +logger = logging.getLogger("ggml-hexagon-trace") + +op_pattern = re.compile( + r"profile-op\s+(?P[A-Z_0-9+]+):\s+.*?\s+:\s+(?P[\d:x\s\->!]+)\s+:\s+(?P[a-z\d_\s\->x]+)\s+:\s+(?P[\d:x\s\->!]+?)\s+:\s+(?:(?P.*?)\s+:\s+)?(?:op-)?usec\s+(?P\d+)\s+(?:op-)?cycles\s+(?P\d+)(?:\s+start\s+(?P\d+))?(?:\s+mhz\s+(?P[\d.]+))?(?:\s+pmu\s+\[(?P[\d,\s]+)\])?(?:\s+evt\s+\[(?P[\d,\s]+)\])?" +) + +trace_pattern = re.compile( + r"trace-op\s+(?P[A-Z_0-9+]+):\s+thread\s+(?P\d+)\s+event\s+(?P[A-Z_0-9\-]+)\s+info\s+(?P\d+)\s+(?Pstart|stop)\s+(?P\d+)" +) + + +def normalize_event_name(evt_type): + if evt_type == "HVX_COMP": + return "V-COMP" + if evt_type == "HMX_COMP": + return "M-COMP" + name = evt_type + if name.startswith("HVX_") or name.startswith("HMX_"): + name = name[4:] + return name.replace("_", "-") + + +class CycleUnwrapper: + def __init__(self): + self.last_raw = None + self.high_part = 0 + + def unwrap(self, raw): + if self.last_raw is None: + self.last_raw = raw + return raw + diff = raw - self.last_raw + if diff < -0x80000000: + self.high_part += 0x100000000 + elif diff > 0x80000000: + self.high_part -= 0x100000000 + self.last_raw = raw + return raw + self.high_part + + +def parse_log(file_path): + try: + if file_path != "-": + f = open(file_path, 'r', encoding='utf-8', errors='ignore') + else: + f = os.fdopen(0, 'r', encoding='utf-8', errors='ignore') + except FileNotFoundError: + logger.error(f"file '{file_path}' not found.") + sys.exit(1) + + all_ops: List[Dict[str, Any]] = [] + current_op: Optional[Dict[str, Any]] = None + unwrapper = CycleUnwrapper() + line_idx = 0 + + for line in f: + line_idx += 1 + if "|" in line and "profile-op" in line: + parts = [p.strip() for p in line.split("|")] + prefix = parts[0] + prefix_match = re.search(r"profile-op\s+(?P[A-Z_0-9+]+)", prefix) + if not prefix_match: + continue + + if len(parts) == 7: + dims, types, strides, params, timings = parts[2], parts[3], parts[4], parts[5], parts[6] + elif len(parts) == 6: + dims, types, strides, params, timings = parts[2], parts[3], parts[4], "", parts[5] + else: + continue + + timing_match = re.search( + r"(?:op-)?usec\s+(?P\d+)\s+(?:op-)?cycles\s+(?P\d+)(?:\s+start\s+(?P\d+))?(?:\s+mhz\s+(?P[\d.]+))?(?:\s+pmu\s+\[(?P[\d,\s]+)\])?(?:\s+evt\s+\[(?P[\d,\s]+)\])?", + timings + ) + if not timing_match: + continue + + op_match = timing_match + op_name = prefix_match.group("op_name") + else: + op_match = op_pattern.search(line) + if op_match: + op_name = op_match.group('op_name') + dims = op_match.group('dims').strip() if op_match.group('dims') else '' + types = op_match.group('types').strip() if op_match.group('types') else '' + strides = op_match.group('strides').strip() if op_match.group('strides') else '' + params = op_match.group('params').strip() if ('params' in op_match.groupdict() and op_match.group('params')) else '' + else: + op_match = None + + if op_match: + cycles_start_raw = op_match.group('start') + unwrapped_cycles_start = None + if cycles_start_raw: + unwrapped_cycles_start = unwrapper.unwrap(int(cycles_start_raw)) + + idx = line.find("profile-op ") + op_text = line[idx + 11:].strip() if idx != -1 else line.strip() + + current_op = { + 'name': op_name, + 'dims': dims, + 'types': types, + 'strides': strides, + 'params': params, + 'op_text': op_text, + 'usec': int(op_match.group('usec')), + 'cycles': int(op_match.group('cycles')), + 'cycles_start': int(cycles_start_raw) if cycles_start_raw else None, + 'unwrapped_cycles_start': unwrapped_cycles_start, + 'trace_events': [], + 'line_num': line_idx + } + all_ops.append(current_op) + continue + + trace_match = trace_pattern.search(line) + if trace_match and current_op: + if trace_match.group('op_name') == current_op['name']: + raw_cyc = int(trace_match.group('cycles')) + current_op['trace_events'].append({ + 'thread': int(trace_match.group('thread')), + 'event': trace_match.group('event'), + 'info': int(trace_match.group('info')), + 'cycles': raw_cyc, + 'unwrapped_cycles': unwrapper.unwrap(raw_cyc), + 'state': trace_match.group('state') + }) + + f.close() + return all_ops + +# --- Simple protobuf encoder --- + + +def write_varint(val): + if val < 0: + val = (1 << 64) + val + res = bytearray() + while True: + towrite = val & 0x7f + val >>= 7 + if val > 0: + res.append(towrite | 0x80) + else: + res.append(towrite) + break + return bytes(res) + + +def pb_field(num, wire, data): + return write_varint((num << 3) | wire) + data + + +def pb_varint(num, val): + return pb_field(num, 0, write_varint(val)) + + +def pb_length_delimited(num, data): + return pb_field(num, 2, write_varint(len(data)) + data) + + +def pb_string(num, text): + return pb_length_delimited(num, text.encode('utf-8')) + + +# Message Encoders +def make_process_descriptor(pid, name): + return pb_varint(1, pid) + pb_string(6, name) + + +def make_thread_descriptor(pid, tid, name, sort_index=None): + payload = pb_varint(1, pid) + pb_varint(2, tid) + pb_string(5, name) + if sort_index is not None: + payload += pb_varint(3, sort_index) + return payload + + +def make_track_descriptor(uuid, name=None, parent_uuid=None, thread=None, process=None, sibling_merge_behavior=None, child_ordering=None, sibling_order_rank=None): + payload = pb_varint(1, uuid) + if name is not None: + payload += pb_string(2, name) + if parent_uuid is not None: + payload += pb_varint(5, parent_uuid) + if process is not None: + payload += pb_length_delimited(3, process) + if thread is not None: + payload += pb_length_delimited(4, thread) + if sibling_merge_behavior is not None: + payload += pb_varint(15, sibling_merge_behavior) + if child_ordering is not None: + payload += pb_varint(11, child_ordering) + if sibling_order_rank is not None: + payload += pb_varint(12, sibling_order_rank) + return payload + + +def make_debug_annotation(name, string_val=None, int_val=None): + payload = pb_string(10, name) + if string_val is not None: + payload += pb_string(6, string_val) + elif int_val is not None: + payload += pb_varint(4, int_val) + return payload + + +def make_track_event(event_type, track_uuid, name=None, category=None, debug_annotations=None): + payload = pb_varint(9, event_type) + payload += pb_varint(11, track_uuid) + if name is not None: + payload += pb_string(23, name) + if category is not None: + payload += pb_string(22, category) + if debug_annotations is not None: + for da in debug_annotations: + payload += pb_length_delimited(4, da) + return payload + + +def make_trace_packet(timestamp, track_event=None, track_descriptor=None, seq_id=1): + payload = pb_varint(8, timestamp) + payload += pb_varint(10, seq_id) + if track_event is not None: + payload += pb_length_delimited(11, track_event) + if track_descriptor is not None: + payload += pb_length_delimited(60, track_descriptor) + return payload + + +def write_trace_packet_to_file(f, packet_bytes): + # Write as field 1 of top-level Trace message + f.write(pb_length_delimited(1, packet_bytes)) + +# --- End Protobuf Encoder --- + + +def generate_perfetto_trace(filtered_ops, output_path): + if not filtered_ops: + logger.warning("No operators found after filtering.") + return + + # Compute average frequency + frequencies = [] + for op in filtered_ops: + if op['usec'] > 0 and op['cycles'] > 0: + frequencies.append(op['cycles'] / op['usec']) + avg_freq_mhz = statistics.mean(frequencies) if frequencies else 1000.0 + if avg_freq_mhz <= 0: + avg_freq_mhz = 1000.0 + + # Assign start and end cycles to each operator + for op in filtered_ops: + op['start_cycles'] = op['unwrapped_cycles_start'] + op['end_cycles'] = op['start_cycles'] + op['cycles'] + + global_min_cyc = min(op['start_cycles'] for op in filtered_ops if op['start_cycles'] is not None) + + # Process events + completed_events = [] + for op in filtered_ops: + events = op['trace_events'] + if not events: + continue + events = sorted(events, key=lambda e: e['unwrapped_cycles']) + + active_starts = {} + for e in events: + t = e['thread'] + evt = e['event'] + info = e['info'] + state = e['state'] + cyc = e['unwrapped_cycles'] + + key = (t, evt, info) + if state == 'start': + active_starts[key] = cyc + elif state == 'stop': + if key in active_starts: + start_cyc = active_starts[key] + del active_starts[key] + completed_events.append({ + 'thread': t, + 'event': evt, + 'info': info, + 'start_cyc': start_cyc, + 'end_cyc': cyc, + 'op_name': op['name'] + }) + + completed_events.sort(key=lambda e: e['start_cyc']) + + # Convert event times to microseconds and apply clamp rounded to 1ns resolution (3 decimals) + for e in completed_events: + start_us = (e['start_cyc'] - global_min_cyc) / avg_freq_mhz + dur_us = (e['end_cyc'] - e['start_cyc']) / avg_freq_mhz + e['ts_ns'] = int(round(start_us * 1000)) + e['dur_ns'] = int(round(max(dur_us, 0.1) * 1000)) + + # Allocate slots (sub-tracks) to prevent overlaps on same virtual track + active_slots = defaultdict(list) + for e in completed_events: + t = e['thread'] + evt = e['event'] + ts = e['ts_ns'] + dur = e['dur_ns'] + + norm_evt = normalize_event_name(evt) + if norm_evt == "DMA": + track_key = (t, "DMA") + elif t == 10: + track_key = (t, "HMX") + else: + track_key = (t, "HVX") + + slots = active_slots[track_key] + allocated_slot = -1 + for idx, slot_end_ns in enumerate(slots): + if ts >= slot_end_ns: + slots[idx] = ts + dur + allocated_slot = idx + break + if allocated_slot == -1: + slots.append(ts + dur) + allocated_slot = len(slots) - 1 + e['slot'] = allocated_slot + + # Generate Track IDs and track definitions + used_tracks = {} + for e in completed_events: + t = e['thread'] + evt = e['event'] + slot = e['slot'] + + norm_evt = normalize_event_name(evt) + if norm_evt == "DMA": + track_evt = "DMA" + evt_id = 1 + elif t == 10: + track_evt = "HMX" + evt_id = 3 + else: + track_evt = "HVX" + evt_id = 2 + + t_sort = 1 if t == 10 else t + 2 + # Unique UUID for each sub-track + if t == 10: + uuid = 20 # HMX thread track UUID + else: + uuid = int(t_sort * 1000000 + evt_id * 1000 + slot) + e['uuid'] = uuid + used_tracks[uuid] = (t, track_evt, slot) + + with open(output_path, "wb") as f: + # Define Process with EXPLICIT child sorting + proc_desc = make_process_descriptor(1, "HTP NPU") + proc_packet = make_trace_packet(0, track_descriptor=make_track_descriptor(1, process=proc_desc, child_ordering=3)) + write_trace_packet_to_file(f, proc_packet) + + # Define Operators Track (UUID = 2) as a thread track at rank 1, tid 8 + op_thread_desc = make_thread_descriptor(1, 8, "Ops", sort_index=1) + op_packet = make_trace_packet(0, track_descriptor=make_track_descriptor(2, parent_uuid=1, thread=op_thread_desc)) + write_trace_packet_to_file(f, op_packet) + + # Define HMX Thread Track (UUID = 20) at rank 2, tid 9 + hmx_thread_desc = make_thread_descriptor(1, 9, "HMX", sort_index=2) + hmx_packet = make_trace_packet(0, track_descriptor=make_track_descriptor(20, parent_uuid=1, thread=hmx_thread_desc)) + write_trace_packet_to_file(f, hmx_packet) + + # Define Thread Tracks (T0, T1, ..., T9) + unique_threads = sorted(list(set(t for (t, _, _) in used_tracks.values() if t != 10))) + for t in unique_threads: + thread_uuid = 10 + t + thread_name = f"T{t}" + # Sort order starts from index 3 (T0 -> 3, T1 -> 4, etc.) + sort_index = 3 + t + tid = 10 + t + thread_desc = make_thread_descriptor(1, tid, thread_name, sort_index=sort_index) + thread_packet = make_trace_packet(0, track_descriptor=make_track_descriptor( + thread_uuid, + parent_uuid=1, + thread=thread_desc, + sibling_order_rank=sort_index, + child_ordering=3 # Explicit child sorting for sub-tracks + )) + write_trace_packet_to_file(f, thread_packet) + + # Define Track descriptors for sub-tracks parented to thread tracks + for uuid in sorted(used_tracks.keys()): + if uuid == 20: + continue + t, evt, slot = used_tracks[uuid] + name = f"T{t} {evt}" + rank = 0 if evt == "HVX" else 1 + parent_thread_uuid = 10 + t + # Sibling merge behavior: 1 (SIBLING_MERGE_BEHAVIOR_BY_TRACK_NAME) + track_desc = make_track_descriptor( + uuid=uuid, + name=name, + parent_uuid=parent_thread_uuid, + sibling_merge_behavior=1, + sibling_order_rank=rank + ) + track_packet = make_trace_packet(0, track_descriptor=track_desc) + write_trace_packet_to_file(f, track_packet) + + # Emit Operators + last_op_end_ns = 0 + for op in filtered_ops: + op_start_ns = int(round(((op['start_cycles'] - global_min_cyc) / avg_freq_mhz) * 1000)) + op_dur_ns = int(round((op['cycles'] / avg_freq_mhz) * 1000)) + if op_start_ns < last_op_end_ns: + op_start_ns = last_op_end_ns + clamped_dur = max(op_dur_ns, 100) # Clamp to 100ns (0.1us) + + # Debug annotations for Ops + debug_annots = [] + if 'line_num' in op: + debug_annots.append(make_debug_annotation("line", int_val=op['line_num'])) + if 'strides' in op and op['strides']: + debug_annots.append(make_debug_annotation("strides", string_val=op['strides'])) + if 'params' in op and op['params'] and op['params'] != '----': + debug_annots.append(make_debug_annotation("params", string_val=op['params'])) + + # Slice Begin + evt_begin = make_track_event(1, 2, name=f"{op['name']} ({op['dims']})", category="operator", debug_annotations=debug_annots) + packet_begin = make_trace_packet(op_start_ns, track_event=evt_begin) + write_trace_packet_to_file(f, packet_begin) + + # Slice End + evt_end = make_track_event(2, 2) + packet_end = make_trace_packet(op_start_ns + clamped_dur, track_event=evt_end) + write_trace_packet_to_file(f, packet_end) + + last_op_end_ns = op_start_ns + clamped_dur + + # Emit Thread Trace Events + for e in completed_events: + norm_name = normalize_event_name(e['event']) + name = f"DMA {e['info']}" if norm_name == "DMA" else norm_name + + # Slice Begin + evt_begin = make_track_event(1, e['uuid'], name=name, category="trace") + packet_begin = make_trace_packet(e['ts_ns'], track_event=evt_begin) + write_trace_packet_to_file(f, packet_begin) + + # Slice End + evt_end = make_track_event(2, e['uuid']) + packet_end = make_trace_packet(e['ts_ns'] + e['dur_ns'], track_event=evt_end) + write_trace_packet_to_file(f, packet_end) + + logger.info(f"Successfully generated Perfetto trace at {output_path}") + + +def main(): + parser = argparse.ArgumentParser(description="Convert Hexagon Op profile logs to native Perfetto Protobuf traces.") + parser.add_argument("logfile", help="Path to hex-log profile file") + parser.add_argument("-o", "--output", default="optrace.perfetto-trace", help="Output trace file path (default: optrace.perfetto-trace)") + parser.add_argument("--filter", type=str, help="Regex filter matching against the original profile-op line") + + group = parser.add_mutually_exclusive_group() + group.add_argument("--head", type=int, help="Limit to first N ops") + group.add_argument("--tail", type=int, help="Limit to last N ops") + + args = parser.parse_args() + logging.basicConfig(level=logging.INFO, format='%(message)s') + + ops = parse_log(args.logfile) + + if args.filter: + try: + filter_re = re.compile(args.filter) + except re.error as e: + logger.error(f"Invalid regex filter: {e}") + sys.exit(1) + ops = [op for op in ops if filter_re.search(op['op_text'])] + + if args.head is not None: + ops = ops[:args.head] + elif args.tail is not None: + ops = ops[-args.tail:] + + generate_perfetto_trace(ops, args.output) + + +if __name__ == "__main__": + main() diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 6e1bf3a1f4b9..27bab1a8ea66 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -7142aa6bf9fcaeec0fef8d80fcd90afe4268adf1 +eced84c86f8b012c752c016f7fe789adea168e1e diff --git a/scripts/sync_vendor.py b/scripts/sync_vendor.py index 8306cf93ec5e..f913b0c7dc2a 100755 --- a/scripts/sync_vendor.py +++ b/scripts/sync_vendor.py @@ -5,7 +5,7 @@ import sys import subprocess -HTTPLIB_VERSION = "refs/tags/v0.46.1" +HTTPLIB_VERSION = "refs/tags/v0.48.0" vendor = { "https://github.com/nlohmann/json/releases/latest/download/json.hpp": "vendor/nlohmann/json.hpp", diff --git a/scripts/ui-assets.cmake b/scripts/ui-assets.cmake index f85c562bd0e5..349fa9bf8113 100644 --- a/scripts/ui-assets.cmake +++ b/scripts/ui-assets.cmake @@ -4,8 +4,9 @@ # 1. Pre-built assets in SRC_DIST_DIR (manually built by user) # 2. If BUILD_UI=ON: npm build # 3. If above did not produce assets and HF_ENABLED=ON: HF Bucket download +# of dist.tar.gz (verified against dist.tar.gz.sha256) -cmake_minimum_required(VERSION 3.16) +cmake_minimum_required(VERSION 3.18) set(UI_SOURCE_DIR "" CACHE STRING "UI source directory (to run npm build)") set(UI_BINARY_DIR "" CACHE STRING "UI binary directory (to store generated files)") @@ -15,56 +16,19 @@ set(HF_VERSION "" CACHE STRING "Version to download (empty = resolve from set(HF_ENABLED "" CACHE STRING "Whether to allow HF Bucket download (ON/OFF)") set(BUILD_UI "" CACHE STRING "Build UI via npm (ON/OFF)") set(LLAMA_UI_EMBED "" CACHE STRING "Path to llama-ui-embed helper") - -set(ASSETS - bundle.css - bundle.js - index.html - loading.html -) +set(LLAMA_UI_GZIP "" CACHE STRING "Apply gzip compress to assets to save bandwidth") set(DIST_DIR "${UI_BINARY_DIR}/dist") set(SRC_DIST_DIR "${UI_SOURCE_DIR}/dist") +set(WORK_DIR "${UI_BINARY_DIR}/ui-src") set(STAMP_FILE "${UI_BINARY_DIR}/.ui-stamp") set(UI_CPP "${UI_BINARY_DIR}/ui.cpp") set(UI_H "${UI_BINARY_DIR}/ui.h") -function(assets_present out_var) - set(present TRUE) - foreach(asset ${ASSETS}) - if(NOT EXISTS "${DIST_DIR}/${asset}") - set(present FALSE) - break() - endif() - endforeach() - set(${out_var} ${present} PARENT_SCOPE) -endfunction() - -function(copy_src_dist out_var) - set(${out_var} FALSE PARENT_SCOPE) - - foreach(asset ${ASSETS}) - if(NOT EXISTS "${SRC_DIST_DIR}/${asset}") - return() - endif() - endforeach() - - file(MAKE_DIRECTORY "${DIST_DIR}") - message(STATUS "UI: using pre-built assets from ${SRC_DIST_DIR}") - foreach(asset ${ASSETS}) - execute_process( - COMMAND ${CMAKE_COMMAND} -E copy_if_different - "${SRC_DIST_DIR}/${asset}" "${DIST_DIR}/${asset}" - ) - endforeach() - set(${out_var} TRUE PARENT_SCOPE) -endfunction() - function(npm_build_should_skip out_var) set(${out_var} FALSE PARENT_SCOPE) - assets_present(present) - if(NOT present) + if(NOT EXISTS "${DIST_DIR}/index.html") return() endif() @@ -101,6 +65,22 @@ function(npm_build_should_skip out_var) set(${out_var} TRUE PARENT_SCOPE) endfunction() +function(stage_sources) + if(EXISTS "${WORK_DIR}") + file(GLOB staged RELATIVE "${WORK_DIR}" "${WORK_DIR}/*") + list(REMOVE_ITEM staged "node_modules") + foreach(entry ${staged}) + file(REMOVE_RECURSE "${WORK_DIR}/${entry}") + endforeach() + endif() + + file(COPY "${UI_SOURCE_DIR}/" + DESTINATION "${WORK_DIR}" + NO_SOURCE_PERMISSIONS + PATTERN "node_modules" EXCLUDE + ) +endfunction() + function(npm_build out_var) set(${out_var} FALSE PARENT_SCOPE) @@ -126,14 +106,16 @@ function(npm_build out_var) return() endif() + stage_sources() + # npm writes node_modules/.package-lock.json on every successful install, # so a package-lock.json newer than this marker means node_modules is stale - set(NPM_MARKER "${UI_SOURCE_DIR}/node_modules/.package-lock.json") + set(NPM_MARKER "${WORK_DIR}/node_modules/.package-lock.json") set(need_install FALSE) if(NOT EXISTS "${NPM_MARKER}") set(need_install TRUE) else() - file(TIMESTAMP "${UI_SOURCE_DIR}/package-lock.json" lock_ts) + file(TIMESTAMP "${WORK_DIR}/package-lock.json" lock_ts) file(TIMESTAMP "${NPM_MARKER}" marker_ts) if(lock_ts STRGREATER marker_ts) set(need_install TRUE) @@ -144,7 +126,7 @@ function(npm_build out_var) message(STATUS "UI: running npm install") execute_process( COMMAND ${NPM_EXECUTABLE} install - WORKING_DIRECTORY "${UI_SOURCE_DIR}" + WORKING_DIRECTORY "${WORK_DIR}" RESULT_VARIABLE rc ERROR_VARIABLE err ) @@ -159,9 +141,9 @@ function(npm_build out_var) message(STATUS "UI: running npm run build, output -> ${DIST_DIR}") execute_process( - COMMAND ${CMAKE_COMMAND} -E env "LLAMA_UI_OUT_DIR=${DIST_DIR}" + COMMAND ${CMAKE_COMMAND} -E env "LLAMA_UI_OUT_DIR=${DIST_DIR}" "LLAMA_UI_VERSION=${HF_VERSION}" "LLAMA_BUILD_NUMBER=${LLAMA_BUILD_NUMBER}" ${NPM_EXECUTABLE} run build - WORKING_DIRECTORY "${UI_SOURCE_DIR}" + WORKING_DIRECTORY "${WORK_DIR}" RESULT_VARIABLE rc ERROR_VARIABLE err ) @@ -171,8 +153,7 @@ function(npm_build out_var) return() endif() - assets_present(present) - if(NOT present) + if(NOT EXISTS "${DIST_DIR}/index.html") message(STATUS "UI: npm build finished but assets missing in ${DIST_DIR}") return() endif() @@ -203,7 +184,7 @@ function(hf_download version out_var out_resolved) set(${out_var} FALSE PARENT_SCOPE) set(${out_resolved} "" PARENT_SCOPE) - file(MAKE_DIRECTORY "${DIST_DIR}") + set(archive "${UI_BINARY_DIR}/dist.tar.gz") set(candidates "") if(NOT "${version}" STREQUAL "") @@ -212,68 +193,88 @@ function(hf_download version out_var out_resolved) list(APPEND candidates "latest") foreach(resolved ${candidates}) - set(base "https://huggingface.co/buckets/ggml-org/${HF_BUCKET}/resolve/${resolved}") - - message(STATUS "UI: downloading from ${resolved}: ${base}") - - set(ok TRUE) - foreach(asset ${ASSETS}) - file(DOWNLOAD "${base}/${asset}?download=true" "${DIST_DIR}/${asset}" - STATUS status TIMEOUT 60 - ) - list(GET status 0 rc) - if(NOT rc EQUAL 0) - list(GET status 1 errmsg) - message(STATUS "UI: download ${asset} from ${resolved} failed: ${errmsg}") - set(ok FALSE) - break() - endif() - message(STATUS "UI: downloaded ${asset}") - endforeach() + set(base "https://huggingface.co/buckets/${HF_BUCKET}/resolve/${resolved}") - if(NOT ok) + message(STATUS "UI: downloading from ${resolved}: ${base}/dist.tar.gz") + + file(DOWNLOAD "${base}/dist.tar.gz?download=true" "${archive}" + STATUS status TIMEOUT 300 + ) + list(GET status 0 rc) + if(NOT rc EQUAL 0) + list(GET status 1 errmsg) + message(STATUS "UI: download dist.tar.gz from ${resolved} failed: ${errmsg}") continue() endif() - # Best-effort checksum verification - file(DOWNLOAD "${base}/checksums.txt?download=true" "${DIST_DIR}/checksums.txt" - STATUS cs_status TIMEOUT 30 + file(DOWNLOAD "${base}/dist.tar.gz.sha256?download=true" "${archive}.sha256" + STATUS status TIMEOUT 30 ) - list(GET cs_status 0 cs_rc) - if(cs_rc EQUAL 0) - message(STATUS "UI: verifying checksums") - file(STRINGS "${DIST_DIR}/checksums.txt" cs_lines) - foreach(asset ${ASSETS}) - file(SHA256 "${DIST_DIR}/${asset}" h) - string(TOLOWER "${h}" h) - string(REGEX MATCH "${h}[ \t]+${asset}" m "${cs_lines}") - if(NOT m) - message(WARNING "UI: checksum verification failed for ${asset}") - set(ok FALSE) - break() - endif() - endforeach() - if(ok) - message(STATUS "UI: all checksums verified") - endif() + list(GET status 0 rc) + if(NOT rc EQUAL 0) + list(GET status 1 errmsg) + message(STATUS "UI: download dist.tar.gz.sha256 from ${resolved} failed: ${errmsg}") + continue() endif() - if(ok) - set(${out_var} TRUE PARENT_SCOPE) - set(${out_resolved} "${resolved}" PARENT_SCOPE) - return() + # Validate sha256 checkums + file(READ "${archive}.sha256" expected) + string(REGEX MATCH "^[0-9a-fA-F]+" expected "${expected}") + string(TOLOWER "${expected}" expected) + file(SHA256 "${archive}" actual) + if("${expected}" STREQUAL "" OR NOT "${actual}" STREQUAL "${expected}") + message(STATUS "UI: checksum mismatch for dist.tar.gz from ${resolved}") + continue() + endif() + + # Clear DIST_DIR to remove stale files first + file(REMOVE_RECURSE "${DIST_DIR}") + + file(ARCHIVE_EXTRACT INPUT "${archive}" DESTINATION "${DIST_DIR}") + + if(NOT EXISTS "${DIST_DIR}/index.html") + message(STATUS "UI: archive from ${resolved} is missing required assets") + continue() endif() + + message(STATUS "UI: archive verified and extracted") + set(${out_var} TRUE PARENT_SCOPE) + set(${out_resolved} "${resolved}" PARENT_SCOPE) + return() endforeach() endfunction() -function(emit_files) - assets_present(present) +function(emit_files dist_dir) + # If gzip is requested, compress every asset into a parallel _gzip/ tree + # the structure stays the same; for ex: /abc/def --> /_gzip/abc/def + # embed.cpp will check for _gzip and will pick it up + if(LLAMA_UI_GZIP AND EXISTS "${dist_dir}/index.html") + find_program(GZIP_EXECUTABLE gzip) + if(NOT GZIP_EXECUTABLE) + message(WARNING "UI: LLAMA_UI_GZIP requested but gzip not found, embedding uncompressed") + else() + set(gzip_dir "${dist_dir}/_gzip") + file(REMOVE_RECURSE "${gzip_dir}") + file(GLOB_RECURSE all_files RELATIVE "${dist_dir}" "${dist_dir}/*") + foreach(f ${all_files}) + get_filename_component(dst_dir "${gzip_dir}/${f}" DIRECTORY) + file(MAKE_DIRECTORY "${dst_dir}") + execute_process( + COMMAND "${GZIP_EXECUTABLE}" -c "${dist_dir}/${f}" + OUTPUT_FILE "${gzip_dir}/${f}" + RESULT_VARIABLE gz_rc + ) + if(NOT gz_rc EQUAL 0) + message(FATAL_ERROR "UI: gzip failed for ${f}") + endif() + endforeach() + message(STATUS "UI: gzip compression applied (${gzip_dir})") + endif() + endif() set(args "${UI_CPP}" "${UI_H}") - if(present) - foreach(asset ${ASSETS}) - list(APPEND args "${asset}" "${DIST_DIR}/${asset}") - endforeach() + if(EXISTS "${dist_dir}/index.html") + list(APPEND args "${dist_dir}") endif() execute_process( @@ -288,9 +289,9 @@ endfunction() # --------------------------------------------------------------------------- # 1. Priority 1: pre-built assets supplied in tools/ui/dist # --------------------------------------------------------------------------- -copy_src_dist(SRC_OK) -if(SRC_OK) - emit_files() +if(EXISTS "${SRC_DIST_DIR}/index.html") + message(STATUS "UI: using pre-built assets from ${SRC_DIST_DIR}") + emit_files("${SRC_DIST_DIR}") return() endif() @@ -300,6 +301,8 @@ endif() set(provisioned FALSE) if(BUILD_UI) + # Resolve version from git build-info if not explicitly set + resolve_version(HF_VERSION) npm_build(NPM_OK) if(NPM_OK) set(provisioned TRUE) @@ -321,7 +324,10 @@ if(NOT provisioned AND HF_ENABLED) endif() endif() - assets_present(have_assets) + set(have_assets FALSE) + if(EXISTS "${DIST_DIR}/index.html") + set(have_assets TRUE) + endif() if(stamp_ok AND have_assets) message(STATUS "UI: HF stamp '${stamped}' matches version, skipping HF fetch") set(provisioned TRUE) @@ -341,8 +347,7 @@ endif() # 4. Fallback: warn about stale or missing assets, then emit whatever we have # --------------------------------------------------------------------------- if(NOT provisioned) - assets_present(have_assets) - if(have_assets) + if(EXISTS "${DIST_DIR}/index.html") message(WARNING "UI: provisioning failed; embedding stale assets from ${DIST_DIR}") else() message(WARNING "UI: no assets available - building without an embedded UI. " @@ -353,4 +358,4 @@ if(NOT provisioned) endif() endif() -emit_files() +emit_files("${DIST_DIR}") diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 680b5fc64df3..4a52d977297c 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -3,7 +3,6 @@ #include "llama-impl.h" #include -#include #include static const std::map LLM_ARCH_NAMES = { @@ -67,6 +66,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_XVERSE, "xverse" }, { LLM_ARCH_COMMAND_R, "command-r" }, { LLM_ARCH_COHERE2, "cohere2" }, + { LLM_ARCH_COHERE2MOE, "cohere2moe" }, { LLM_ARCH_DBRX, "dbrx" }, { LLM_ARCH_OLMO, "olmo" }, { LLM_ARCH_OLMO2, "olmo2" }, @@ -128,6 +128,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_RND1, "rnd1" }, { LLM_ARCH_PANGU_EMBED, "pangu-embedded" }, { LLM_ARCH_MISTRAL3, "mistral3" }, + { LLM_ARCH_EAGLE3, "eagle3" }, { LLM_ARCH_MISTRAL4, "mistral4" }, { LLM_ARCH_PADDLEOCR, "paddleocr" }, { LLM_ARCH_MIMO2, "mimo2" }, @@ -292,46 +293,51 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" }, + { LLM_KV_TARGET_LAYERS, "%s.target_layers" }, + { LLM_KV_TARGET_HIDDEN_SIZE, "%s.target_hidden_size" }, + { LLM_KV_NORM_BEFORE_RESIDUAL, "%s.norm_before_residual" }, + { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" }, // sentence-transformers dense modules feature dims { LLM_KV_DENSE_2_FEAT_IN, "%s.dense_2_feat_in" }, - { LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" }, - { LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" }, - { LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" }, - - { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, - { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, - { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, - { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, - { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, - { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, - { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, - { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, - { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, - { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, - { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, - { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, - { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, - { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, - { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, - { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, - { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, - { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, - { LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" }, - { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, - { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" }, - { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, - { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, - { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, - { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" }, - { LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" }, - { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" }, - { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" }, - { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" }, - { LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" }, - { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, - { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, - { LLM_KV_TOKENIZER_SUPPRESS_TOKENS, "tokenizer.ggml.suppress_tokens" }, + { LLM_KV_DENSE_2_FEAT_OUT, "%s.dense_2_feat_out" }, + { LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" }, + { LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" }, + + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, + { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, + { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, + { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, + { LLM_KV_TOKENIZER_SCORES, "tokenizer.ggml.scores" }, + { LLM_KV_TOKENIZER_MERGES, "tokenizer.ggml.merges" }, + { LLM_KV_TOKENIZER_BOS_ID, "tokenizer.ggml.bos_token_id" }, + { LLM_KV_TOKENIZER_EOS_ID, "tokenizer.ggml.eos_token_id" }, + { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, + { LLM_KV_TOKENIZER_EOM_ID, "tokenizer.ggml.eom_token_id" }, + { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, + { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, + { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, + { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, + { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, + { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, + { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, + { LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" }, + { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, + { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" }, + { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" }, + { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, + { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, + { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" }, + { LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, "tokenizer.ggml.normalizer.lowercase" }, + { LLM_KV_TOKENIZER_NORMALIZER_STRIP_ACCENTS, "tokenizer.ggml.normalizer.strip_accents" }, + { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" }, + { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" }, + { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" }, + { LLM_KV_TOKENIZER_FIM_PAD_ID, "tokenizer.ggml.fim_pad_token_id" }, + { LLM_KV_TOKENIZER_FIM_REP_ID, "tokenizer.ggml.fim_rep_token_id" }, + { LLM_KV_TOKENIZER_FIM_SEP_ID, "tokenizer.ggml.fim_sep_token_id" }, + { LLM_KV_TOKENIZER_SUPPRESS_TOKENS, "tokenizer.ggml.suppress_tokens" }, { LLM_KV_ADAPTER_TYPE, "adapter.type" }, { LLM_KV_ADAPTER_LORA_ALPHA, "adapter.lora.alpha" }, @@ -561,6 +567,8 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_INDEXER_ATTN_Q_B, "blk.%d.indexer.attn_q_b" }, { LLM_TENSOR_MASKED_EMBD_CENTROIDS, "masked_embd_centroids" }, { LLM_TENSOR_MASKED_EMBD_ORDERING, "masked_embd_ordering" }, + { LLM_TENSOR_FC, "fc" }, + { LLM_TENSOR_D2T, "d2t" }, }; // declare information about the model weight tensors: @@ -787,6 +795,9 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_FFN_LATENT_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_MASKED_EMBD_CENTROIDS, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}}, {LLM_TENSOR_MASKED_EMBD_ORDERING, {LLM_TENSOR_LAYER_INPUT, GGML_OP_NONE}}, + // eagle3 + {LLM_TENSOR_FC, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_D2T, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_GET_ROWS}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} diff --git a/src/llama-arch.h b/src/llama-arch.h index b65fce72e646..989da06d8d51 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -71,6 +71,7 @@ enum llm_arch { LLM_ARCH_XVERSE, LLM_ARCH_COMMAND_R, LLM_ARCH_COHERE2, + LLM_ARCH_COHERE2MOE, LLM_ARCH_DBRX, LLM_ARCH_OLMO, LLM_ARCH_OLMO2, @@ -141,6 +142,7 @@ enum llm_arch { LLM_ARCH_KIMI_LINEAR, LLM_ARCH_TALKIE, LLM_ARCH_MELLUM, + LLM_ARCH_EAGLE3, LLM_ARCH_UNKNOWN, }; @@ -314,6 +316,7 @@ enum llm_kv { LLM_KV_TOKENIZER_RWKV, LLM_KV_TOKENIZER_CHAT_TEMPLATE, LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, + LLM_KV_TOKENIZER_NORMALIZER_STRIP_ACCENTS, LLM_KV_TOKENIZER_FIM_PRE_ID, LLM_KV_TOKENIZER_FIM_SUF_ID, LLM_KV_TOKENIZER_FIM_MID_ID, @@ -336,6 +339,10 @@ enum llm_kv { LLM_KV_CLASSIFIER_OUTPUT_LABELS, + LLM_KV_TARGET_LAYERS, + LLM_KV_TARGET_HIDDEN_SIZE, + LLM_KV_NORM_BEFORE_RESIDUAL, + LLM_KV_SHORTCONV_L_CACHE, LLM_KV_XIELU_ALPHA_N, @@ -568,6 +575,8 @@ enum llm_tensor { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, LLM_TENSOR_MASKED_EMBD_CENTROIDS, LLM_TENSOR_MASKED_EMBD_ORDERING, + LLM_TENSOR_FC, + LLM_TENSOR_D2T, }; diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9a40c4366af1..220240ea952b 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -71,6 +71,9 @@ llama_context::llama_context( cparams.no_perf = params.no_perf; cparams.warmup = false; + cparams.embeddings_layer_inp.resize(hparams.n_layer(), false); + embd_layer_inp.resize(hparams.n_layer()); + cparams.ctx_type = params.ctx_type; cparams.pooling_type = params.pooling_type; @@ -91,12 +94,21 @@ llama_context::llama_context( if (model.arch == LLM_ARCH_GEMMA4_ASSISTANT) { if (params.ctx_other == nullptr) { // TODO: change from runtime_error to llama_exception to avoid printing error message - throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this is normal during memory fitting)"); + throw std::runtime_error("Gemma4Assistant requires ctx_other to be set (this warning is normal during memory fitting)"); } cparams.ctx_other = params.ctx_other; } + if (model.arch == LLM_ARCH_EAGLE3) { + if (model.tok_embd == nullptr || model.output == nullptr) { + if (params.ctx_other == nullptr) { + throw std::runtime_error("EAGLE3 requires ctx_other to be set (this warning is normal during memory fitting)"); + } + cparams.ctx_other = params.ctx_other; + } + } + // Initialize backend samplers here so they are part of the sampling graph // before the reserve passes run later in this function. This avoids a later // re-reserve when graph nodes change. @@ -194,7 +206,7 @@ llama_context::llama_context( cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); - cparams.n_outputs_max = params.n_outputs_max == 0 ? cparams.n_batch : params.n_outputs_max; + cparams.n_outputs_max = params.n_outputs_max == 0 || llama_model_has_encoder(&model) ? cparams.n_batch : params.n_outputs_max; cparams.op_offload = params.op_offload; cparams.kv_unified = params.kv_unified; @@ -938,6 +950,14 @@ float * llama_context::get_embeddings_nextn_ith(int32_t i) { } } +float * llama_context::get_embeddings_layer_inp(uint32_t lid) { + output_reorder(); + + GGML_ASSERT(lid < embd_layer_inp.size() && embd_layer_inp[lid].has_data()); + + return embd_layer_inp[lid].data; +} + llama_token llama_context::get_sampled_token_ith(int32_t idx) { output_reorder(); @@ -1125,6 +1145,21 @@ void llama_context::set_embeddings_nextn(bool value, bool masked) { cparams.embeddings_nextn_masked = masked; } +void llama_context::set_embeddings_layer_inp(uint32_t lid, bool enable) { + LLAMA_LOG_DEBUG("%s: lid = %d, enable = %d\n", __func__, lid, enable); + + GGML_ASSERT(lid < model.hparams.n_layer()); + + cparams.embeddings_layer_inp[lid] = enable; + + // note: without this reserve, the draft acceptance drops to zero. not sure why - this is unexpected + sched_need_reserve = true; +} + +void llama_context::set_nextn_layer_offset(int32_t offset) { + cparams.nextn_layer_offset = offset; +} + void llama_context::set_causal_attn(bool value) { LLAMA_LOG_DEBUG("%s: value = %d\n", __func__, value); @@ -1350,7 +1385,8 @@ int llama_context::encode(const llama_batch & batch_inp) { const auto & hparams = model.hparams; - const int64_t n_embd = hparams.n_embd_inp(); + // eagle3/DFlash: features as encoder input, and non-draft paths fall back to model's input dim + const int64_t n_embd = hparams.n_embd_inp_enc(); const int64_t n_vocab = model.vocab.n_tokens(); // note: during encode, we always pass the full sequence starting from pos = 0 @@ -1925,6 +1961,8 @@ int llama_context::decode(const llama_batch & batch_inp) { } } + extract_layer_inputs(res, n_tokens_prev, ubatch.n_tokens); + // extract nextn embeddings before // only meaningful in LLAMA_POOLING_TYPE_NONE (per-token); other pooling modes are ignored. { @@ -2029,6 +2067,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const auto n_batch = cparams.n_batch; const auto n_vocab = vocab.n_tokens(); + const auto n_embd = hparams.n_embd; const auto n_embd_out = hparams.n_embd_out(); bool has_logits = true; @@ -2041,9 +2080,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { has_embd = true; } - size_t backend_float_count = 0; size_t backend_token_count = 0; + size_t embd_layer_inp_float_count = 0; logits.size = has_logits ? n_vocab*n_outputs_max : 0; embd.size = has_embd ? n_embd_out*n_outputs_max : 0; @@ -2055,6 +2094,12 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { embd_nextn.size = (size_t) n_embd_out * n_batch; } + for (bool enabled : cparams.embeddings_layer_inp) { + if (enabled) { + embd_layer_inp_float_count += (size_t) n_embd * n_batch; + } + } + // Allocate backend sampling output buffers if there are backend samplers configured. const bool has_sampling = !sampling.samplers.empty(); if (has_sampling) { @@ -2069,8 +2114,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { const size_t prev_size = buf_output ? ggml_backend_buffer_get_size(buf_output.get()) : 0; const size_t new_size = - (logits.size + embd.size + embd_nextn.size + backend_float_count) * sizeof(float) + - ( backend_token_count) * sizeof(llama_token); + (logits.size + embd.size + embd_nextn.size + embd_layer_inp_float_count + backend_float_count) * sizeof(float) + + ( backend_token_count) * sizeof(llama_token); // alloc only when more than the current capacity is required // TODO: also consider shrinking the buffer @@ -2087,6 +2132,9 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { logits.data = nullptr; embd.data = nullptr; embd_nextn.data = nullptr; + for (auto & layer_inp : embd_layer_inp) { + layer_inp = {nullptr, 0}; + } } auto * buft = ggml_backend_cpu_buffer_type(); @@ -2118,6 +2166,15 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { embd_nextn = has_embd_nextn ? buffer_view{(float *) (base + offset), embd_nextn.size} : buffer_view{nullptr, 0}; offset += embd_nextn.size * sizeof(float); + for (uint32_t il = 0; il < embd_layer_inp.size(); ++il) { + if (cparams.embeddings_layer_inp[il]) { + embd_layer_inp[il] = buffer_view{(float *) (base + offset), (size_t) n_embd * n_batch}; + offset += embd_layer_inp[il].size * sizeof(float); + } else { + embd_layer_inp[il] = buffer_view{nullptr, 0}; + } + } + if (has_sampling) { sampling.logits = {(float *) (base + offset), (size_t)(n_vocab*n_outputs_max)}; offset += sampling.logits.size * sizeof(float); @@ -2164,6 +2221,34 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) { return n_outputs_max; } +void llama_context::extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens) { + for (uint32_t il = 0; il < cparams.embeddings_layer_inp.size(); ++il) { + if (!cparams.embeddings_layer_inp[il]) { + continue; + } + if (!embd_layer_inp[il].has_data()) { + GGML_ABORT("output layer input buffer not allocated"); + } + ggml_tensor * t = res->get_layer_inp((int) il); + if (!t) { + GGML_ABORT("layer input tensor not found"); + } + + const size_t nbytes = ggml_nbytes(t); + const size_t nfloats = nbytes / sizeof(float); + GGML_ASSERT(n_tokens > 0); + GGML_ASSERT(nfloats % n_tokens == 0); + + const size_t row_floats = nfloats / n_tokens; + const size_t dst_offset = token_offset * row_floats; + GGML_ASSERT(dst_offset + nfloats <= embd_layer_inp[il].size); + + ggml_backend_t backend = ggml_backend_sched_get_tensor_backend(sched.get(), t); + GGML_ASSERT(backend != nullptr); + ggml_backend_tensor_get_async(backend, t, embd_layer_inp[il].data + dst_offset, 0, nbytes); + } +} + void llama_context::output_reorder() { const uint64_t n_vocab = model.vocab.n_tokens(); const uint64_t n_embd = model.hparams.n_embd; @@ -2190,6 +2275,16 @@ void llama_context::output_reorder() { } } + if (embd_layer_inp.size() > 0) { + for (int lid = 0; lid < (int) embd_layer_inp.size(); ++lid) { + if (embd_layer_inp[lid].size > 0) { + for (uint64_t k = 0; k < n_embd; ++k) { + std::swap(embd_layer_inp[lid].data[i0*n_embd + k], embd_layer_inp[lid].data[i1*n_embd + k]); + } + } + } + } + if (!sampling.samplers.empty()) { assert(sampling.logits.size > 0); assert(sampling.probs.size > 0); @@ -3604,6 +3699,14 @@ void llama_set_embeddings_nextn(llama_context * ctx, bool value, bool masked) { ctx->set_embeddings_nextn(value, masked); } +void llama_set_embeddings_layer_inp(llama_context * ctx, uint32_t lid, bool value) { + ctx->set_embeddings_layer_inp(lid, value); +} + +void llama_set_nextn_layer_offset(llama_context * ctx, int32_t offset) { + ctx->set_nextn_layer_offset(offset); +} + llama_memory_t llama_get_memory(const struct llama_context * ctx) { if (!ctx) { return nullptr; @@ -3624,6 +3727,12 @@ float * llama_get_embeddings_nextn_ith(llama_context * ctx, int32_t i) { return ctx->get_embeddings_nextn_ith(i); } +float * llama_get_embeddings_layer_inp(llama_context * ctx, uint32_t lid) { + ctx->synchronize(); + + return ctx->get_embeddings_layer_inp(lid); +} + bool llama_set_sampler(llama_context * ctx, llama_seq_id seq_id, llama_sampler * smpl) { return ctx->set_sampler(seq_id, smpl); } diff --git a/src/llama-context.h b/src/llama-context.h index 6f8f59a22a3e..f8b7805871ef 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -88,6 +88,8 @@ struct llama_context { float * get_embeddings_nextn(); float * get_embeddings_nextn_ith(int32_t i); + float * get_embeddings_layer_inp(uint32_t lid); + llama_token * get_sampled_tokens() const; llama_token get_sampled_token_ith(int32_t idx); @@ -112,6 +114,8 @@ struct llama_context { void set_embeddings (bool value); void set_embeddings_nextn(bool value, bool masked); + void set_embeddings_layer_inp(uint32_t lid, bool enable); + void set_nextn_layer_offset(int32_t offset); void set_causal_attn(bool value); void set_warmup(bool value); @@ -226,6 +230,10 @@ struct llama_context { // map the output row index `i` to batch index int64_t output_resolve_row(int32_t i) const; + // async-copy enabled layer-input tensors (per cparams.output_layer_inp) + // from backend into host-side embd_layer_inp buffers + void extract_layer_inputs(const llm_graph_result * res, size_t token_offset, size_t n_tokens); + // // graph // @@ -288,6 +296,10 @@ struct llama_context { // sets llm_graph_result::t_h_nextn buffer_view embd_nextn = {nullptr, 0}; + // host buffers for output layer input embeddings, per layer + // populated when cparams.output_layer_inp[il] is true + std::vector> embd_layer_inp; + struct sampling_info { // !samplers.empty() to check if any samplers are active std::map samplers; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 8a35d389ef40..546ae1e2c126 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -3,6 +3,7 @@ #include "llama.h" #include +#include #define LLAMA_MAX_SEQ 256 @@ -17,6 +18,8 @@ struct llama_cparams { int32_t n_threads; // number of threads to use for generation int32_t n_threads_batch; // number of threads to use for batch processing + int32_t nextn_layer_offset = 0; + float rope_freq_base; float rope_freq_scale; @@ -44,6 +47,8 @@ struct llama_cparams { bool kv_unified; bool pipeline_parallel; + std::vector embeddings_layer_inp; // [n_layer()] extract input embeddings for layer + enum llama_context_type ctx_type; enum llama_pooling_type pooling_type; diff --git a/src/llama-ext.h b/src/llama-ext.h index bd74544129b4..348bbae95770 100644 --- a/src/llama-ext.h +++ b/src/llama-ext.h @@ -2,6 +2,7 @@ // this is a staging header for new llama.cpp API // breaking changes and C++ are allowed. everything here should be considered WIP +// try as much as possible to not include this header in the rest of the codebase #include "llama.h" @@ -94,6 +95,11 @@ LLAMA_API llama_memory_breakdown llama_get_memory_breakdown(const struct llama_c // If masked == false, output the embeddings for all tokens in the batch regardless of batch.logits LLAMA_API void llama_set_embeddings_nextn(struct llama_context * ctx, bool value, bool masked); +// Select which appended NextN block the DECODER_MTP graph runs (offset past +// the trunk: il = n_layer() + offset). Used by the speculative NextN driver to +// chain multiple trained NextN heads. Default 0 (first head). +LLAMA_API void llama_set_nextn_layer_offset(struct llama_context * ctx, int32_t offset); + // mirrors: // LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx); @@ -101,4 +107,20 @@ LLAMA_API float * llama_get_embeddings_nextn(struct llama_context * ctx); // LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); LLAMA_API float * llama_get_embeddings_nextn_ith(struct llama_context * ctx, int32_t i); +// Set whether the context outputs the input embeddings of a specific layer +LLAMA_API void llama_set_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid, bool value); + +// mirrors: +// LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); +LLAMA_API float * llama_get_embeddings_layer_inp(struct llama_context * ctx, uint32_t lid); + LLAMA_API llama_context * llama_get_ctx_other(struct llama_context * ctx); + +// +// model/context data extraction +// + +// returns pointer to the target-model layer indices +LLAMA_API const int32_t * llama_model_target_layer_ids (const struct llama_model * model); +// returns the number of extracted layers from target model +LLAMA_API uint32_t llama_model_target_layer_ids_n(const struct llama_model * model); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 3d942ba4fe1d..68c9e606c3e3 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -904,6 +904,10 @@ void llm_graph_result::reset() { t_logits = nullptr; t_embd = nullptr; t_embd_pooled = nullptr; + + t_layer_inp.resize(LLAMA_MAX_LAYERS); + std::fill(t_layer_inp.begin(), t_layer_inp.end(), nullptr); + t_sampled.clear(); t_sampled_probs.clear(); t_sampled_logits.clear(); @@ -932,7 +936,7 @@ void llm_graph_result::set_inputs(const llama_ubatch * ubatch) { } } -void llm_graph_result::set_outputs() { +void llm_graph_result::set_outputs(const llm_graph_params & params) { if (t_logits != nullptr) { ggml_set_output(t_logits); } @@ -945,6 +949,15 @@ void llm_graph_result::set_outputs() { if (t_h_nextn != nullptr) { ggml_set_output(t_h_nextn); } + { + const auto & embeddings_layer_inp = params.cparams.embeddings_layer_inp; + for (size_t il = 0; il < embeddings_layer_inp.size(); ++il) { + if (embeddings_layer_inp[il]) { + GGML_ASSERT(t_layer_inp[il] != nullptr && "layer input tensor is null"); + ggml_set_output(t_layer_inp[il]); + } + } + } for (auto & [seq_id, t] : t_sampled) { if (t != nullptr) { ggml_set_output(t); @@ -1075,6 +1088,10 @@ ggml_tensor * llm_graph_context::build_lora_mm( ggml_tensor * w_s) const { ggml_tensor * res = ggml_mul_mat(ctx0, w, cur); + if (w_s) { + res = ggml_mul(ctx0, res, w_s); + } + for (const auto & lora : *loras) { llama_adapter_lora_weight * lw = lora.first->get_weight(w); if (lw == nullptr) { @@ -1093,18 +1110,24 @@ ggml_tensor * llm_graph_context::build_lora_mm( res = ggml_add(ctx0, res, ab_cur); } - if (w_s) { - res = ggml_mul(ctx0, res, w_s); - } - return res; } ggml_tensor * llm_graph_context::build_lora_mm_id( ggml_tensor * w, // ggml_tensor * as ggml_tensor * cur, // ggml_tensor * b - ggml_tensor * ids) const { + ggml_tensor * ids, + ggml_tensor * w_s) const { ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids); + + if (w_s) { + const int64_t n_expert = w_s->ne[0]; + const int64_t n_tokens = cur->ne[2]; + ggml_tensor * s = ggml_reshape_3d(ctx0, w_s, 1, n_expert, 1); + s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1); + s = ggml_get_rows(ctx0, s, ids); + res = ggml_mul(ctx0, res, s); + } for (const auto & lora : *loras) { llama_adapter_lora_weight * lw = lora.first->get_weight(w); if (lw == nullptr) { @@ -1256,6 +1279,29 @@ ggml_tensor * llm_graph_context::build_ffn( llm_ffn_op_type type_op, llm_ffn_gate_type type_gate, int il) const { + // NVFP4 support is currently restricted to + // 1) LORA absence (*_s would be applied after LORA residual, which is incorrect) + // 2) bias absense (*_s would be applied after bias addition, which is incorrect) + // TODO: disambiguate LLM-architectural scales (which use *_s) from NVFP4 scale_2 (which also uses *_s currently) + auto has_lora = [this](ggml_tensor * w) { + if (!w) { + return false; + } + for (const auto & lora : *loras) { + if (lora.first->get_weight(w) != nullptr) { + return true; + } + } + return false; + }; + + GGML_ASSERT(!up_s || !up_b || !up || up->type != GGML_TYPE_NVFP4); + GGML_ASSERT(!gate_s || !gate_b || !gate || gate->type != GGML_TYPE_NVFP4); + GGML_ASSERT(!down_s || !down_b || !down || down->type != GGML_TYPE_NVFP4); + GGML_ASSERT(!up_s || !up || up->type != GGML_TYPE_NVFP4 || !has_lora(up)); + GGML_ASSERT(!gate_s || !gate || gate->type != GGML_TYPE_NVFP4 || !has_lora(gate)); + GGML_ASSERT(!down_s || !down || down->type != GGML_TYPE_NVFP4 || !has_lora(down)); + ggml_tensor * tmp = up ? build_lora_mm(up, cur) : cur; cb(tmp, "ffn_up", il); @@ -1614,23 +1660,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn( if (gate_up_exps) { // merged gate_up path: one mul_mat_id, then split into gate and up views - ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts); // [n_ff*2, n_expert_used, n_tokens] + ggml_tensor * gate_up = build_lora_mm_id(gate_up_exps, cur, selected_experts, up_exps_s); // [n_ff*2, n_expert_used, n_tokens] cb(gate_up, "ffn_moe_gate_up", il); + if (up_exps_s) { + cb(gate_up, "ffn_moe_gate_up_scaled", il); + } + if (gate_up_exps_b) { gate_up = ggml_add_id(ctx0, gate_up, gate_up_exps_b, selected_experts); cb(gate_up, "ffn_moe_gate_up_biased", il); } - // apply per-expert scale2 to merged gate_up (use up_exps_s since gate and up are fused) - if (up_exps_s) { - ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1); - s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1); - s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens] - gate_up = ggml_mul(ctx0, gate_up, s); - cb(gate_up, "ffn_moe_gate_up_scaled", il); - } - const int64_t n_ff = gate_up->ne[0] / 2; cur = ggml_view_3d(ctx0, gate_up, n_ff, gate_up->ne[1], gate_up->ne[2], gate_up->nb[1], gate_up->nb[2], 0); cb(cur, "ffn_moe_gate", il); @@ -1638,43 +1679,33 @@ ggml_tensor * llm_graph_context::build_moe_ffn( cb(up, "ffn_moe_up", il); } else { // separate gate and up path - up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + up = build_lora_mm_id(up_exps, cur, selected_experts, up_exps_s); // [n_ff, n_expert_used, n_tokens] cb(up, "ffn_moe_up", il); + if (up_exps_s) { + cb(up, "ffn_moe_up_scaled", il); + } + if (up_exps_b) { up = ggml_add_id(ctx0, up, up_exps_b, selected_experts); cb(up, "ffn_moe_up_biased", il); } - // apply per-expert scale2 to up - if (up_exps_s) { - ggml_tensor * s = ggml_reshape_3d(ctx0, up_exps_s, 1, n_expert, 1); - s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1); - s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens] - up = ggml_mul(ctx0, up, s); - cb(up, "ffn_moe_up_scaled", il); - } - if (gate_exps) { - cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cur = build_lora_mm_id(gate_exps, cur, selected_experts, gate_exps_s); // [n_ff, n_expert_used, n_tokens] cb(cur, "ffn_moe_gate", il); } else { cur = up; } + if (gate_exps_s) { + cb(cur, "ffn_moe_gate_scaled", il); + } + if (gate_exps_b) { cur = ggml_add_id(ctx0, cur, gate_exps_b, selected_experts); cb(cur, "ffn_moe_gate_biased", il); } - - // apply per-expert scale2 to gate - if (gate_exps_s) { - ggml_tensor * s = ggml_reshape_3d(ctx0, gate_exps_s, 1, n_expert, 1); - s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1); - s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens] - cur = ggml_mul(ctx0, cur, s); - cb(cur, "ffn_moe_gate_scaled", il); - } } const bool has_gate = gate_exps || gate_up_exps; @@ -1746,23 +1777,18 @@ ggml_tensor * llm_graph_context::build_moe_ffn( GGML_ABORT("fatal error"); } - experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] + experts = build_lora_mm_id(down_exps, cur, selected_experts, down_exps_s); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); + if (down_exps_s) { + cb(experts, "ffn_moe_down_scaled", il); + } + if (down_exps_b) { experts = ggml_add_id(ctx0, experts, down_exps_b, selected_experts); cb(experts, "ffn_moe_down_biased", il); } - // apply per-expert scale2 to down - if (down_exps_s) { - ggml_tensor * s = ggml_reshape_3d(ctx0, down_exps_s, 1, n_expert, 1); - s = ggml_repeat_4d(ctx0, s, 1, n_expert, n_tokens, 1); - s = ggml_get_rows(ctx0, s, selected_experts); // [1, n_expert_used, n_tokens] - experts = ggml_mul(ctx0, experts, s); - cb(experts, "ffn_moe_down_scaled", il); - } - if (!weight_before_ffn) { experts = ggml_mul(ctx0, experts, weights); cb(experts, "ffn_moe_weighted", il); diff --git a/src/llama-graph.h b/src/llama-graph.h index 6793846e3ea6..a6e8c3985ba5 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -682,9 +682,16 @@ struct llm_graph_params { } } + // TODO: https://github.com/ggml-org/llama.cpp/pull/24340#discussion_r3448035248 + if (cparams.nextn_layer_offset != other.cparams.nextn_layer_offset) { + return false; + } + return - cparams.embeddings == other.cparams.embeddings && - cparams.causal_attn == other.cparams.causal_attn && + cparams.embeddings == other.cparams.embeddings && + cparams.embeddings_nextn == other.cparams.embeddings_nextn && + cparams.embeddings_nextn_masked == other.cparams.embeddings_nextn_masked && + cparams.causal_attn == other.cparams.causal_attn && arch == other.arch && gtype == other.gtype && cvec == other.cvec && @@ -705,6 +712,8 @@ class llm_graph_result { ggml_tensor * get_embd_pooled() const { return t_embd_pooled; } ggml_tensor * get_h_nextn() const { return t_h_nextn; } + ggml_tensor * get_layer_inp(int il) const { return t_layer_inp[il]; } + ggml_cgraph * get_gf() const { return gf; } ggml_context * get_ctx() const { return ctx_compute.get(); } @@ -713,7 +722,7 @@ class llm_graph_result { void reset(); void set_inputs(const llama_ubatch * ubatch); - void set_outputs(); + void set_outputs(const llm_graph_params & params); // try to update the existing graph result using the new graph parameters in order to reuse it // this can only be done if we determine that the resulting graph using the new graph parameters @@ -734,10 +743,12 @@ class llm_graph_result { ggml_tensor * t_embd_pooled = nullptr; ggml_tensor * t_h_nextn = nullptr; // [n_embd, n_outputs] hidden state before final output norm - std::map t_sampled_logits; - std::map t_candidates; - std::map t_sampled; - std::map t_sampled_probs; + std::vector t_layer_inp; + + std::map t_sampled_logits; + std::map t_candidates; + std::map t_sampled; + std::map t_sampled_probs; std::vector inputs; @@ -849,11 +860,12 @@ struct llm_graph_context { ggml_tensor * cur, ggml_tensor * w_s = nullptr) const; - // do mat_mul_id, while optionally apply lora + // do mat_mul_id, while optionally apply lora and per-expert scale ggml_tensor * build_lora_mm_id( ggml_tensor * w, // ggml_tensor * as ggml_tensor * cur, // ggml_tensor * b - ggml_tensor * ids) const; + ggml_tensor * ids, + ggml_tensor * w_s = nullptr) const; ggml_tensor * build_norm( ggml_tensor * cur, diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 2bf576873824..9d0683d2fec4 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -104,6 +104,10 @@ uint32_t llama_hparams::n_embd_inp() const { return n_embd_inp; } +uint32_t llama_hparams::n_embd_inp_enc() const { + return n_embd_inp_enc_impl > 0 ? n_embd_inp_enc_impl : n_embd_inp(); +} + uint32_t llama_hparams::n_embd_out() const { return n_embd_out_impl > 0 ? n_embd_out_impl : n_embd; } diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 032944cb481c..2eadeb214811 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -45,6 +45,7 @@ struct llama_hparams { bool rope_finetuned; bool use_par_res; bool swin_norm; + bool norm_before_residual = false; uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; @@ -188,6 +189,10 @@ struct llama_hparams { // input embedding dimension (0 = use n_embd) uint32_t n_embd_inp_impl = 0; + // encoder input embedding dimension (0 = use n_embd_inp()) + // e.g. the eagle3 encoder fuses target_layers * target_hidden features + uint32_t n_embd_inp_enc_impl = 0; + // output embedding dimension (0 = use n_embd) uint32_t n_embd_out_impl = 0; @@ -304,6 +309,9 @@ struct llama_hparams { // dimension of main + auxiliary input embeddings uint32_t n_embd_inp() const; + // dimension of the encoder input embeddings + uint32_t n_embd_inp_enc() const; + // dimension of output embeddings uint32_t n_embd_out() const; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 0d1cf3cc33bb..982a55f0d66b 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -37,6 +37,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_F16: return "F16"; case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; case LLAMA_FTYPE_MOSTLY_Q1_0: return "Q1_0"; + case LLAMA_FTYPE_MOSTLY_Q2_0: return "Q2_0"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; case LLAMA_FTYPE_MOSTLY_Q5_0: return "Q5_0"; @@ -394,6 +395,7 @@ namespace GGUFMeta { template bool llama_model_loader::get_arr>(enum llm_kv kid, std::vector & result, bool required); template bool llama_model_loader::get_arr>(enum llm_kv kid, std::array & result, bool required); + template bool llama_model_loader::get_arr>(enum llm_kv kid, std::vector & result, bool required); template bool llama_model_loader::get_key(const std::string & key, T & result, bool required) { @@ -761,6 +763,7 @@ llama_model_loader::llama_model_loader( case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; case GGML_TYPE_NVFP4: ftype = LLAMA_FTYPE_MOSTLY_NVFP4; break; case GGML_TYPE_Q1_0: ftype = LLAMA_FTYPE_MOSTLY_Q1_0; break; + case GGML_TYPE_Q2_0: ftype = LLAMA_FTYPE_MOSTLY_Q2_0; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 67d4a9df0f0e..a3928523ba8d 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -18,6 +18,7 @@ bool llama_model_saver_supports_arch(llm_arch arch) { case LLM_ARCH_GEMMA3: case LLM_ARCH_GEMMA3N: case LLM_ARCH_COHERE2: + case LLM_ARCH_COHERE2MOE: case LLM_ARCH_OLMO2: case LLM_ARCH_BITNET: case LLM_ARCH_T5: diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 4f12e0949acb..6cb0ec3791c2 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -157,6 +157,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_command_r(params); case LLM_ARCH_COHERE2: return new llama_model_cohere2(params); + case LLM_ARCH_COHERE2MOE: + return new llama_model_cohere2moe(params); case LLM_ARCH_DBRX: return new llama_model_dbrx(params); case LLM_ARCH_OLMO: @@ -287,6 +289,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_qwen35moe(params); case LLM_ARCH_MISTRAL3: return new llama_model_mistral3(params); + case LLM_ARCH_EAGLE3: + return new llama_model_eagle3(params); case LLM_ARCH_MIMO2: return new llama_model_mimo2(params); case LLM_ARCH_KIMI_LINEAR: @@ -696,6 +700,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_160M: return "160M"; case LLM_TYPE_190M: return "190M"; case LLM_TYPE_220M: return "220M"; + case LLM_TYPE_230M: return "230M"; case LLM_TYPE_250M: return "250M"; case LLM_TYPE_256M: return "256M"; case LLM_TYPE_270M: return "270M"; @@ -1465,9 +1470,12 @@ bool llama_model_base::load_tensors(llama_model_loader & ml) { } ml.done_getting_tensors(); + // Tied NVFP4 output is valid when no separate LM-head scale tensors are present. + // If sidecar scales exist, the output weight must be an actual output tensor. GGML_ASSERT(!(output && tok_embd && strcmp(output->name, tok_embd->name) == 0 && - output->type == GGML_TYPE_NVFP4)); + output->type == GGML_TYPE_NVFP4 && + (output_s || output_in_s))); // populate tensors_by_name for (auto & [_, ctx_ptr] : ml.ctx_map) { for (auto * cur = ggml_get_first_tensor(ctx_ptr.get()); cur != NULL; cur = ggml_get_next_tensor(ctx_ptr.get(), cur)) { @@ -1842,6 +1850,7 @@ void llama_model::print_info() const { } if (arch == LLM_ARCH_MELLUM || + arch == LLM_ARCH_COHERE2MOE || arch == LLM_ARCH_QWEN3MOE || arch == LLM_ARCH_OPENAI_MOE || arch == LLM_ARCH_QWEN3VLMOE || @@ -2238,7 +2247,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { // TODO: move reranking logic here and generalize llm->build_dense_out(dense_2_out_layers, dense_2_out_layers_b, dense_3_out_layers); - llm->res->set_outputs(); + llm->res->set_outputs(params); return llm->res->get_gf(); } @@ -2304,6 +2313,10 @@ int32_t llama_model_n_layer(const llama_model * model) { return model->hparams.n_layer(); } +int32_t llama_model_n_layer_nextn(const llama_model * model) { + return model->hparams.n_layer_nextn; +} + int32_t llama_model_n_head(const llama_model * model) { return model->hparams.n_head(); } @@ -2387,6 +2400,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_XVERSE: case LLM_ARCH_COMMAND_R: case LLM_ARCH_COHERE2: + case LLM_ARCH_COHERE2MOE: case LLM_ARCH_OLMO: case LLM_ARCH_ARCTIC: case LLM_ARCH_DEEPSEEK: @@ -2406,6 +2420,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_ERNIE4_5: case LLM_ARCH_ERNIE4_5_MOE: case LLM_ARCH_MISTRAL3: + case LLM_ARCH_EAGLE3: case LLM_ARCH_MISTRAL4: case LLM_ARCH_LLAMA_EMBED: case LLM_ARCH_MAINCODER: @@ -2600,8 +2615,9 @@ uint64_t llama_model_n_params(const llama_model * model) { bool llama_model_has_encoder(const llama_model * model) { switch (model->arch) { - case LLM_ARCH_T5: return true; - case LLM_ARCH_T5ENCODER: return true; + case LLM_ARCH_T5: + case LLM_ARCH_T5ENCODER: + case LLM_ARCH_EAGLE3: return true; default: return false; } } @@ -2687,3 +2703,12 @@ void llama_model_base::create_tensor_qkv(llama_layer & layer, int bid, layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", bid), {n_embd_v_}, TENSOR_NOT_REQUIRED); } } + +const int32_t * llama_model_target_layer_ids(const struct llama_model * model) { + const auto & v = model->target_layer_ids; + return v.empty() ? nullptr : v.data(); +} + +uint32_t llama_model_target_layer_ids_n(const struct llama_model * model) { + return (uint32_t) model->target_layer_ids.size(); +} diff --git a/src/llama-model.h b/src/llama-model.h index 992c8d9c8fd9..77d8d3b6258a 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -36,6 +36,7 @@ enum llm_type { LLM_TYPE_160M, LLM_TYPE_190M, LLM_TYPE_220M, + LLM_TYPE_230M, LLM_TYPE_250M, LLM_TYPE_256M, LLM_TYPE_270M, @@ -569,6 +570,13 @@ struct llama_model { struct ggml_tensor * per_layer_model_proj = nullptr; struct ggml_tensor * per_layer_proj_norm = nullptr; + // eagle3 + struct ggml_tensor * fc = nullptr; // feature fusion layer + struct ggml_tensor * d2t = nullptr; // draft to target vocabulary mapping + + // unified vector to store target-model extracted layer ids in eagle3, dflash, etc. + std::vector target_layer_ids; + std::vector layers; //Dense linear projections for SentenceTransformers models like embeddinggemma diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index cf92ce4bb8b7..aebbc1ffb6f1 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -380,6 +380,7 @@ static ggml_type tensor_type_fallback(quantize_state_impl & qs, const ggml_tenso case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ3_S: // types on the right: block size 32 case GGML_TYPE_IQ4_XS: return_type = GGML_TYPE_IQ4_NL; break; + case GGML_TYPE_Q2_0: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_TQ1_0: @@ -480,7 +481,7 @@ static ggml_type llama_tensor_get_type_impl(quantize_state_impl & qs, ggml_type else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = GGML_TYPE_IQ3_S; } - else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0) { + else if (ftype == LLAMA_FTYPE_MOSTLY_TQ1_0 || ftype == LLAMA_FTYPE_MOSTLY_TQ2_0 || ftype == LLAMA_FTYPE_MOSTLY_Q2_0) { new_type = GGML_TYPE_Q4_K; } } @@ -800,6 +801,7 @@ ggml_type llama_ftype_get_default_type(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_BF16: return GGML_TYPE_BF16; case LLAMA_FTYPE_ALL_F32: return GGML_TYPE_F32; case LLAMA_FTYPE_MOSTLY_Q1_0: return GGML_TYPE_Q1_0; + case LLAMA_FTYPE_MOSTLY_Q2_0: return GGML_TYPE_Q2_0; case LLAMA_FTYPE_MOSTLY_MXFP4_MOE: return GGML_TYPE_MXFP4; @@ -847,7 +849,7 @@ static void init_quantize_state_counters(quantize_state_impl & qs, std::vectordata[i].logit = -INFINITY; } } - - llama_sampler_softmax_impl(cur_p, true); } static struct llama_sampler * llama_sampler_top_n_sigma_clone(const struct llama_sampler * smpl) { diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 9a4bed49487b..6e78a3f6c0ea 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -764,7 +764,7 @@ struct llm_tokenizer_wpm_session { void tokenize(const std::string & text, std::vector & output) { // normalize and split by whitespace - std::vector words = preprocess(text, vocab.get_normalizer_lowercase()); + std::vector words = preprocess(text, vocab.get_normalizer_opts()); // bos token prepended already // find the longest tokens that form the words @@ -809,11 +809,14 @@ struct llm_tokenizer_wpm_session { } // TODO: reduce string copies by using cpts_offs array - static std::vector preprocess(const std::string & text, bool lowercase) { - const std::vector cpts_nfd = unicode_cpts_normalize_nfd(unicode_cpts_from_utf8(text)); + static std::vector preprocess(const std::string & text, const llama_vocab::normalizer_options & normalizer_opts) { + std::vector cpts = unicode_cpts_from_utf8(text); + if (normalizer_opts.strip_accents) { + cpts = unicode_cpts_normalize_nfd(cpts); + } std::vector words(1, ""); - for (const uint32_t cpt : cpts_nfd) { + for (const uint32_t cpt : cpts) { const auto flags = unicode_cpt_flags_from_cpt(cpt); if (flags.is_whitespace) { @@ -828,7 +831,11 @@ struct llm_tokenizer_wpm_session { continue; } - const std::string s = unicode_cpt_to_utf8(lowercase ? unicode_tolower(cpt) : cpt); + if (normalizer_opts.strip_accents && flags.is_accent_mark) { + continue; + } + + const std::string s = unicode_cpt_to_utf8(normalizer_opts.lowercase ? unicode_tolower(cpt) : cpt); if (flags.is_punctuation || ( cpt < 0x7F && flags.is_symbol ) || is_chinese_char(cpt)) { if (words.back().size()) { // finish previous word if any words.emplace_back(); @@ -1692,7 +1699,7 @@ struct llm_tokenizer_whitespace_session : llm_tokenizer_bpe_session { llm_tokenizer_whitespace_session(const llama_vocab & vocab, const llm_tokenizer_bpe & tokenizer) : llm_tokenizer_bpe_session{vocab, tokenizer}, vocab{vocab} {} void tokenize(const std::string & text, std::vector & output) override { - const bool lowercase = vocab.get_normalizer_lowercase(); + const bool lowercase = vocab.get_normalizer_opts().lowercase; std::string segment; auto flush = [&]() { @@ -1797,7 +1804,9 @@ struct llama_vocab::impl { bool remove_extra_whitespaces = false; bool escape_whitespaces = true; bool treat_whitespace_as_suffix = false; - bool normalizer_lowercase = true; // Lowercase normalizer (tokenizer.json) + + // BertNormalizer options + llama_vocab::normalizer_options normalizer_opts; std::unordered_map token_to_id; std::vector id_to_token; @@ -2172,7 +2181,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } else if ( tokenizer_pre == "whitespace") { pre_type = LLAMA_VOCAB_PRE_TYPE_WHITESPACE; - normalizer_lowercase = false; + normalizer_opts.lowercase = false; } else if ( tokenizer_pre == "refact") { pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT; @@ -2271,7 +2280,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { clean_spaces = false; ignore_merges = true; } else if ( - tokenizer_pre == "tiny_aya") { + tokenizer_pre == "tiny_aya" || + tokenizer_pre == "cohere2moe") { pre_type = LLAMA_VOCAB_PRE_TYPE_TINY_AYA; clean_spaces = false; } else if ( @@ -2532,8 +2542,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { } } - // Lowercase normalizer flag (consulted by WPM / whitespace BPE) - ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_lowercase, false); + // BertNormalizer options + ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_LOWERCASE, normalizer_opts.lowercase, false); + normalizer_opts.strip_accents = normalizer_opts.lowercase; + ml.get_key(LLM_KV_TOKENIZER_NORMALIZER_STRIP_ACCENTS, normalizer_opts.strip_accents, false); // suppress tokens { @@ -3969,8 +3981,8 @@ bool llama_vocab::get_treat_whitespace_as_suffix() const { return pimpl->treat_whitespace_as_suffix; } -bool llama_vocab::get_normalizer_lowercase() const { - return pimpl->normalizer_lowercase; +const llama_vocab::normalizer_options & llama_vocab::get_normalizer_opts() const { + return pimpl->normalizer_opts; } const std::vector & llama_vocab::get_suppress_tokens() const { diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 2626ae36e33f..707cd4bac4bf 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -76,6 +76,12 @@ struct llama_vocab { llama_token_attr attr; }; + struct normalizer_options { + bool lowercase = true; + bool strip_accents = true; + // TODO: clean_text, handle_chinese_chars + }; + llama_vocab(); ~llama_vocab(); @@ -141,7 +147,7 @@ struct llama_vocab { bool get_remove_extra_whitespaces () const; bool get_escape_whitespaces () const; bool get_treat_whitespace_as_suffix() const; - bool get_normalizer_lowercase () const; + const normalizer_options & get_normalizer_opts() const; const std::vector & get_suppress_tokens() const; diff --git a/src/llama.cpp b/src/llama.cpp index a67fa8039a48..0de6048f2820 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -249,7 +249,7 @@ static bool llama_prepare_model_devices(const llama_model_params & params, llama } // if using single GPU mode, remove all except the main GPU - if (params.split_mode == LLAMA_SPLIT_MODE_NONE) { + if (params.split_mode == LLAMA_SPLIT_MODE_NONE && !model->devices.empty()) { if (params.main_gpu < 0) { model->devices.clear(); } else { diff --git a/src/models/cohere2.cpp b/src/models/cohere2.cpp index 61a5945a1948..e2b3662560df 100644 --- a/src/models/cohere2.cpp +++ b/src/models/cohere2.cpp @@ -122,9 +122,9 @@ llama_model_cohere2::graph::graph(const llama_model & model, const llm_graph_par // feed-forward network { cur = build_ffn(ffn_inp, - model.layers[il].ffn_up, NULL, NULL, - model.layers[il].ffn_gate, NULL, NULL, - model.layers[il].ffn_down, NULL, NULL, + model.layers[il].ffn_up, NULL, model.layers[il].ffn_up_s, + model.layers[il].ffn_gate, NULL, model.layers[il].ffn_gate_s, + model.layers[il].ffn_down, NULL, model.layers[il].ffn_down_s, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); cb(cur, "ffn_out", il); } diff --git a/src/models/cohere2moe.cpp b/src/models/cohere2moe.cpp new file mode 100644 index 000000000000..499c73a1c49c --- /dev/null +++ b/src/models/cohere2moe.cpp @@ -0,0 +1,443 @@ +#include "models.h" + +void llama_model_cohere2moe::load_arch_hparams(llama_model_loader & ml) { + const bool found_norm = ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps, false); + const bool found_norm_rms = ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps, false); + if (!found_norm && !found_norm_rms) { + throw std::runtime_error("missing Cohere2 MoE norm epsilon"); + } + if (!found_norm_rms) { + hparams.f_norm_rms_eps = 0.0f; + } + + ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa); + ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_shexp, false); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared, false); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm, false); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale, false); + ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false); + + ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.n_layer_nextn, false); + GGML_ASSERT(hparams.n_layer_nextn < hparams.n_layer_all && "n_layer_nextn must be < n_layer"); + + if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) { + hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID; + } + + hparams.swa_type = LLAMA_SWA_TYPE_STANDARD; + uint32_t swa_period = 4; + if (ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false)) { + hparams.set_swa_pattern(swa_period, true); + } else { + ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, hparams.is_swa_impl, hparams.n_layer()); + } + + hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; + hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; + ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); + + switch (hparams.n_layer()) { + case 49: type = LLM_TYPE_30B_A3B; break; + default: type = LLM_TYPE_UNKNOWN; + } +} + +void llama_model_cohere2moe::load_arch_tensors(llama_model_loader & ml) { + LLAMA_LOAD_LOCALS; + + const bool mtp_only = (hparams.n_layer_nextn > 0) && (ml.get_weight("blk.0.attn_norm.weight") == nullptr); + // Trunk-only: the GGUF declares MTP layers in metadata but the actual MTP + // tensors live in a separate file. Mark MTP tensors NOT_REQUIRED so the + // trunk loads cleanly. + const std::string mtp_probe = "blk." + std::to_string(n_layer) + ".nextn.eh_proj.weight"; + const bool trunk_only = (hparams.n_layer_nextn > 0) && (ml.get_weight(mtp_probe.c_str()) == nullptr); + const int trunk_flags = mtp_only ? TENSOR_NOT_REQUIRED : 0; + const int mtp_flags = trunk_only ? TENSOR_NOT_REQUIRED : 0; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }, TENSOR_DUPLICATED); + } + + if (n_expert == 0) { + throw std::runtime_error("n_expert must be > 0 for Cohere2Moe"); + } + if (n_expert_used == 0) { + throw std::runtime_error("n_expert_used must be > 0 for Cohere2Moe"); + } + + auto load_block_trunk = [&](int i, int flags) { + auto & layer = layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags); + + create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, flags); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags); + + if (static_cast(i) < hparams.n_layer_dense_lead) { + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), { n_embd, n_ff }, flags); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }, flags); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, n_ff }, flags); + } else { + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff; + + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, flags); + + if (hparams.n_expert_shared > 0) { + const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp * hparams.n_expert_shared; + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags); + } + } + }; + + auto load_block_mtp = [&](int i, int flags) { + auto & layer = layers[i]; + + // MTP block looks like a full-attention Cohere2 MoE decoder block. + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }, flags); + + create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head, n_embd_gqa, n_embd_gqa, flags); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, flags); + + const int64_t n_ff_exp = hparams.n_ff_exp ? hparams.n_ff_exp : n_ff; + + // Routed experts + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), { n_embd, n_expert }, flags); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff_exp, n_embd, n_expert }, flags); + create_tensor_gate_up_exps(layer, i, n_embd, n_ff_exp, n_expert, flags); + + if (hparams.n_expert_shared > 0) { + const int64_t n_ff_shexp = hparams.n_ff_shexp ? hparams.n_ff_shexp : n_ff_exp * hparams.n_expert_shared; + + // Shared experts + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_shexp, n_embd }, flags); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), { n_embd, n_ff_shexp }, flags); + } + + // NextN-specific tensors that define the MTP block. + layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, flags); + layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, flags); + layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, flags); + layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); + layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); + }; + + for (int i = 0; i < n_layer; ++i) { + load_block_trunk(i, trunk_flags); + } + // MTP/NextN layers are loaded as extra decoder blocks. + for (int i = n_layer; i < n_layer_all; ++i) { + load_block_mtp(i, mtp_flags); + } +} + +std::unique_ptr llama_model_cohere2moe::build_arch_graph(const llm_graph_params & params) const { + if (params.gtype == LLM_GRAPH_TYPE_DECODER_MTP) { + return std::make_unique(*this, params); + } + return std::make_unique(*this, params); +} + +llama_model_cohere2moe::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); + + const llm_norm_type cohere2moe_norm_type = hparams.f_norm_rms_eps == 0.0f ? LLM_NORM : LLM_NORM_RMS; + const float f_logit_scale = hparams.f_logit_scale; + ggml_tensor * cur; + ggml_tensor * inpL = build_inp_embd(model.tok_embd); + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv_iswa(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. + for (int il = 0; il < n_layer; ++il) { + const bool is_swa = hparams.is_swa(il); + // Dense-prefix full-attention layers use RoPE; later layers follow the SWA pattern. + const bool force_rope = static_cast(il) < hparams.n_layer_dense_lead; + + cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, cohere2moe_norm_type, il); + cb(cur, "attn_norm", il); + + ggml_tensor * ffn_inp = cur; + + { + const auto & layer = model.layers[il]; + + auto [Qcur, Kcur, Vcur] = build_qkv(layer, cur, + n_embd_head, n_head, n_head_kv, il); + + if (is_swa || force_rope) { + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + layer.wo, layer.wo_b, layer.wo_s, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, + 1.0f / sqrtf(float(n_embd_head)), il); + } + + if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); + } + + ggml_tensor * attn_out = cur; + + const auto & layer = model.layers[il]; + + if (layer.ffn_gate_inp == nullptr) { + cur = build_ffn(ffn_inp, + layer.ffn_up, nullptr, layer.ffn_up_s, + layer.ffn_gate, nullptr, layer.ffn_gate_s, + layer.ffn_down, nullptr, layer.ffn_down_s, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + cur = build_moe_ffn(ffn_inp, + layer.ffn_gate_inp, + layer.ffn_up_exps, + layer.ffn_gate_exps, + layer.ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, hparams.expert_weights_norm, + hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il, + nullptr, layer.ffn_gate_up_exps, + layer.ffn_up_exps_s, + layer.ffn_gate_exps_s, + layer.ffn_down_exps_s); + cb(cur, "ffn_moe_out", il); + + if (layer.ffn_up_shexp) { + ggml_tensor * ffn_shexp = build_ffn(ffn_inp, + layer.ffn_up_shexp, nullptr, layer.ffn_up_shexp_s, + layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s, + layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, cur, ffn_shexp); + cur = ggml_scale(ctx0, cur, 0.5f); + cb(cur, "ffn_out", il); + } + } + + cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx0, cur, attn_out); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + inpL = cur; + } + + cur = inpL; + cur = build_norm(cur, model.output_norm, nullptr, cohere2moe_norm_type, -1); + + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; + + if (!cparams.embeddings_nextn_masked && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + cur = build_lora_mm(model.output, cur); + + if (f_logit_scale) { + cur = ggml_scale(ctx0, cur, f_logit_scale); + } + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} + +llama_model_cohere2moe::graph_mtp::graph_mtp(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + GGML_ASSERT(hparams.n_layer_nextn > 0 && "COHERE2MOE MTP requires n_layer_nextn > 0"); + GGML_ASSERT(hparams.n_layer_nextn == 1 && "COHERE2MOE MTP currently only supports a single MTP block"); + + const int64_t n_embd_head = hparams.n_embd_head_v(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_embd_head == n_rot); + + const int il = hparams.n_layer(); + const auto & layer = model.layers[il]; + GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); + GGML_ASSERT(layer.nextn.enorm && "MTP block missing nextn.enorm"); + GGML_ASSERT(layer.nextn.hnorm && "MTP block missing nextn.hnorm"); + GGML_ASSERT(layer.ffn_gate_inp && "MTP block missing ffn_gate_inp"); + + const llm_norm_type cohere2moe_norm_type = hparams.f_norm_rms_eps == 0.0f ? LLM_NORM : LLM_NORM_RMS; + + // TODO: extract in a common llm_graph_context::build_inp_embd_h() + auto inp = std::make_unique(hparams.n_embd); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->tokens); + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp(), n_tokens); + ggml_set_input(inp->embd); + + // TODO: make static using `ggml_build_forward_select()` + // see llm_graph_context::build_inp_embd() for reference + ggml_tensor * tok_embd; + if (ubatch.token) { + ggml_tensor * tok_embd_w = layer.nextn.embed_tokens ? layer.nextn.embed_tokens : model.tok_embd; + tok_embd = ggml_get_rows(ctx0, tok_embd_w, inp->tokens); + } else { + tok_embd = inp->embd; + } + cb(tok_embd, "mtp_tok_embd", il); + + inp->h = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd, n_tokens); + ggml_set_input(inp->h); + ggml_set_name(inp->h, "mtp_h_input"); + + ggml_tensor * h_embd = inp->h; + + res->add_input(std::move(inp)); + + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + auto * inp_attn = build_attn_inp_kv_iswa(); + + ggml_tensor * h_norm = build_norm(h_embd, layer.nextn.hnorm, nullptr, cohere2moe_norm_type, il); + cb(h_norm, "mtp_hnorm", il); + + ggml_tensor * e_norm = build_norm(tok_embd, layer.nextn.enorm, nullptr, cohere2moe_norm_type, il); + cb(e_norm, "mtp_enorm", il); + + ggml_tensor * concat = ggml_concat(ctx0, e_norm, h_norm, /*dim=*/ 0); + cb(concat, "mtp_concat", il); + + ggml_tensor * cur = build_lora_mm(layer.nextn.eh_proj, concat, layer.nextn.eh_proj_s); + cb(cur, "mtp_eh_proj", il); + + ggml_tensor * inpL = cur; + + cur = build_norm(cur, layer.attn_norm, nullptr, cohere2moe_norm_type, il); + cb(cur, "mtp_attn_norm", il); + ggml_tensor * ffn_inp = cur; + + auto [Qcur, Kcur, Vcur] = build_qkv(layer, cur, n_embd_head, n_head, n_head_kv, il); + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "mtp_Qcur", il); + cb(Kcur, "mtp_Kcur", il); + cb(Vcur, "mtp_Vcur", il); + + cur = build_attn(inp_attn, + layer.wo, layer.wo_b, layer.wo_s, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, + 1.0f / sqrtf(float(n_embd_head)), il); + cb(cur, "mtp_attn_out", il); + + ggml_tensor * attn_out = cur; + + cur = build_moe_ffn(ffn_inp, + layer.ffn_gate_inp, + layer.ffn_up_exps, + layer.ffn_gate_exps, + layer.ffn_down_exps, + nullptr, + n_expert, n_expert_used, + LLM_FFN_SILU, hparams.expert_weights_norm, + hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il, + nullptr, layer.ffn_gate_up_exps, + layer.ffn_up_exps_s, + layer.ffn_gate_exps_s, + layer.ffn_down_exps_s); + cb(cur, "mtp_ffn_moe_out", il); + + if (layer.ffn_up_shexp) { + ggml_tensor * ffn_shexp = build_ffn(ffn_inp, + layer.ffn_up_shexp, nullptr, layer.ffn_up_shexp_s, + layer.ffn_gate_shexp, nullptr, layer.ffn_gate_shexp_s, + layer.ffn_down_shexp, nullptr, layer.ffn_down_shexp_s, + nullptr, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(ffn_shexp, "mtp_ffn_shexp", il); + + cur = ggml_add(ctx0, cur, ffn_shexp); + cur = ggml_scale(ctx0, cur, 0.5f); + cb(cur, "mtp_ffn_out", il); + } + + cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx0, cur, attn_out); + cb(cur, "mtp_post_ffn", il); + + ggml_tensor * head_norm_w = layer.nextn.shared_head_norm + ? layer.nextn.shared_head_norm + : model.output_norm; + GGML_ASSERT(head_norm_w && "COHERE2MOE MTP: missing both nextn.shared_head_norm and output_norm"); + cur = build_norm(cur, head_norm_w, nullptr, cohere2moe_norm_type, -1); + + cb(cur, "h_nextn", -1); + res->t_h_nextn = cur; + + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + cb(cur, "mtp_shared_head_norm", -1); + + ggml_tensor * head_w = layer.nextn.shared_head_head ? layer.nextn.shared_head_head : model.output; + GGML_ASSERT(head_w && "COHERE2MOE MTP: missing LM head (nextn.shared_head_head or model.output)"); + cur = build_lora_mm(head_w, cur, layer.nextn.shared_head_head ? layer.nextn.shared_head_head_s : nullptr); + + if (hparams.f_logit_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_logit_scale); + } + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/eagle3.cpp b/src/models/eagle3.cpp new file mode 100644 index 000000000000..9d96fae5944e --- /dev/null +++ b/src/models/eagle3.cpp @@ -0,0 +1,323 @@ +#include "models.h" + +void llama_model_eagle3::load_arch_hparams(llama_model_loader & ml) { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + if (!ml.get_arr(LLM_KV_TARGET_LAYERS, target_layer_ids, false)) { + throw std::runtime_error("EAGLE3 model requires 'extract_layers' in GGUF metadata"); + } + if (target_layer_ids.size() != 3) { + throw std::runtime_error("EAGLE3 requires exactly 3 entries in 'extract_layers'"); + } + LLAMA_LOG_INFO("%s: EAGLE3 extract_layers = [%d, %d, %d]\n", __func__, + target_layer_ids[0], + target_layer_ids[1], + target_layer_ids[2]); + + uint32_t n_embd_tgt = 0; + + ml.get_key(LLM_KV_TARGET_HIDDEN_SIZE, n_embd_tgt); + LLAMA_LOG_INFO("%s: EAGLE3 n_embd_tgt = %u (draft n_embd = %u)\n", __func__, n_embd_tgt, hparams.n_embd); + + hparams.n_embd_inp_enc_impl = (uint32_t) target_layer_ids.size() * n_embd_tgt; + + // eagle3 norm_before_residual (optional, default false) + // compatible with Readhat eagle3 speculator model + ml.get_key(LLM_KV_NORM_BEFORE_RESIDUAL, hparams.norm_before_residual, false); + if (hparams.norm_before_residual) { + LLAMA_LOG_INFO("%s: EAGLE3gnorm_before_residual = true\n", __func__); + } + + type = LLM_TYPE_UNKNOWN; +} + +void llama_model_eagle3::load_arch_tensors(llama_model_loader &) { + LLAMA_LOAD_LOCALS; + + const int64_t n_embd_inp = hparams.n_embd_inp_enc(); + const int64_t n_embd_attn_input = 2 * n_embd; + + // Get vocab size from the d2t tensor in the GGUF file (optional - only needed if eagle3 has different vocab_size than target) + // d2t: draft to target vocabulary mapping + int64_t n_draft_vocab = n_vocab; // Default: same as target vocab + const struct ggml_tensor * d2t_meta = ml->get_tensor_meta("d2t"); + if (d2t_meta) { + n_draft_vocab = d2t_meta->ne[0]; // update draft vocab size + d2t = create_tensor(tn(LLM_TENSOR_D2T), {n_draft_vocab}, 0); + LLAMA_LOG_INFO("%s: EAGLE3 using d2t mapping (draft_vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); + } else { + d2t = nullptr; // no d2t, use default vocab size + LLAMA_LOG_INFO("%s: EAGLE3 without d2t - sharing same vocab_size with target (vocab_size = %lld)\n", __func__, (long long)n_draft_vocab); + } + + // Feature fusion layer: projects 3 target layers to draft hidden size + fc = create_tensor(tn(LLM_TENSOR_FC, "weight"), {n_embd_inp, n_embd}, 0); + + // Output layer (uses draft vocab size) + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_draft_vocab}, TENSOR_NOT_REQUIRED); + + // Token embeddings (optional - Llama 3.3 70B EAGLE3 has its own) + const struct ggml_tensor * tok_embd_meta = ml->get_tensor_meta(tn(LLM_TENSOR_TOKEN_EMBD, "weight").str().c_str()); + if (tok_embd_meta) { + const int64_t n_target_vocab = tok_embd_meta->ne[1]; + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_target_vocab}, 0); + LLAMA_LOG_INFO("%s: EAGLE3 using its own token_embd (vocab = %lld)\n", __func__, (long long)n_target_vocab); + } + + // Single decoder layer + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + // input_layernorm: applied to token embeddings + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + // eagle3 specific: hidden_norm applied to fused target features + layer.attn_norm_2 = create_tensor(tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, 0); + + // Attention takes input_embeds_normed + fused_target_normed as input + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd_attn_input, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd_attn_input, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd_attn_input, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + // rope_freqs for llama3 rope scaling (optional - only if eagle3 config has rope_scaling) + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED); + } +} + +std::unique_ptr llama_model_eagle3::build_arch_graph(const llm_graph_params & params) const { + switch (params.gtype) { + case LLM_GRAPH_TYPE_ENCODER: + return std::make_unique>(*this, params); + case LLM_GRAPH_TYPE_DEFAULT: + case LLM_GRAPH_TYPE_DECODER: + return std::make_unique>(*this, params); + default: + GGML_ABORT("invalid graph type"); + }; +} + +template <> +ggml_tensor * llama_model_eagle3::graph::build_inp_embd_enc() const { + ggml_tensor * cur = nullptr; + + // Input: Target model features (3 layers concatenated: low, mid, high) + // Data will be provided via ubatch->embd in encode_eagle3_features() + auto inp_target = std::make_unique(hparams.n_embd_inp_enc()); + inp_target->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hparams.n_embd_inp_enc(), n_tokens); + ggml_set_input(inp_target->embd); + + cur = inp_target->embd; + cb(cur, "inp_embd", -1); + + res->add_input(std::move(inp_target)); + + return cur; +} + +// eagle3 Encoder: processes target model features through feature fusion layer +// Input: target_features e.g. [12288, n_tokens] from target model layers low, middle, high +// Output: g_embeddings e.g. [4096, n_tokens] stored in context +template <> +llama_model_eagle3::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + ggml_tensor * cur = nullptr; + + cur = build_inp_embd_enc(); + + // Feature fusion layer + cur = build_lora_mm(model.fc, cur); + cb(cur, "fc_out", -1); + + // Output: g_embeddings e.g. [4096, n_tokens] + // store in t_h_nextn (same as MTP) so can be read via llama_get_embeddings_nextn(ctx_dft) + ggml_set_output(cur); + res->t_h_nextn = cur; + + ggml_build_forward_expand(gf, cur); +} + +// eagle3 Decoder: processes draft tokens using g_embeddings from encoder +// Input: draft tokens + g_embeddings from encoder +// Output: draft logits +template <> +llama_model_eagle3::graph::graph(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k()); + GGML_ASSERT(n_layer == 1); // eagle3 has only one decoder layer + + ggml_tensor * cur; + ggml_tensor * inpL; + + // eagle3 Decoder receives: + // 1. Token embeddings (e.g.from eagle3's own tok_embd for Llama 3.3 70B, or target model for Llama 3.1 8B) + // 2. g_embeddings from encoder + auto * tok_embd = model.tok_embd; + if (model.tok_embd == nullptr) { + GGML_ASSERT(cparams.ctx_other != nullptr); + const auto * model_other = llama_get_model(cparams.ctx_other); + + GGML_ASSERT(model_other->tok_embd != nullptr && "EAGLE3 decoder requires token embeddings (own or from target model)"); + tok_embd = model_other->tok_embd; + } + + auto inp = std::make_unique(n_embd); + + inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); + ggml_set_input(inp->tokens); + + inp->embd = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); + ggml_set_input(inp->embd); + + ggml_tensor * inp_embd = ggml_get_rows(ctx0, tok_embd, inp->tokens); + cb(inp_embd, "inp_embd", -1); + + ggml_tensor * inp_g = inp->embd; + cb(inp_g, "inp_g_embeddings", -1); + + res->add_input(std::move(inp)); + + inpL = inp_g; + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv(); + + const float kq_scale = 1.0f/sqrtf(float(n_embd_head)); + + // Single decoder layer (il = 0) + const int il = 0; + { + // Apply input_layernorm to the token embeddings + ggml_tensor * embd_norm = build_norm(inp_embd, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, il); + cb(embd_norm, "embd_norm", il); + + // Apply hidden_norm to inp_g + ggml_tensor * g_norm = build_norm(inp_g, + model.layers[il].attn_norm_2, NULL, + LLM_NORM_RMS, -1); + cb(g_norm, "g_norm", il); + + // norm_before_residual: determines what goes into the residual connection (compatible with Readhat eagle3 speculator model) + // - false (default): use raw inp_g for residual + // - true: use normalized g_norm for residual + // inpL is the concatenated input (normalized inp_embd + normalized inp_g) + ggml_tensor * inpSA = hparams.norm_before_residual ? g_norm : inpL; + + // Concatenate normalized inp_embd and normalized inp_g + cur = ggml_concat(ctx0, embd_norm, g_norm, il); + cb(cur, "concat_embd", il); + + // Self-attention with concatenated input + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // rope freq factors, returns nullptr if not available + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + + // RoPE + Qcur = ggml_rope_ext( + ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur_rope", il); + cb(Kcur, "Kcur_rope", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il); + + // Add residual and update it + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // Apply FFN norm to the sum + cur = build_norm(ffn_inp, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, il); + cb(cur, "post_attn_norm", il); + + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + // Output norm with residual + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "eagle3_prenorm", il); + + inpL = cur; + } + + cur = inpL; + + // Output prenorm state (for next token's g_embeddings in autoregressive generation) + ggml_set_output(cur); + res->t_h_nextn = cur; + + cur = build_norm(cur, + model.output_norm, NULL, + LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + + // lm_head - projects to draft vocabulary + // if the draft has no own output projection, inherit the target model's lm_head + auto * output = model.output; + if (output == nullptr) { + GGML_ASSERT(cparams.ctx_other != nullptr); + const auto * model_other = llama_get_model(cparams.ctx_other); + + GGML_ASSERT(model_other->output != nullptr && "EAGLE3 decoder requires an output projection (own or from target model)"); + output = model_other->output; + } + cur = build_lora_mm(output, cur); + + if (model.d2t) { + const int64_t n_draft_vocab = cur->ne[0]; + const int64_t n_outputs = cur->ne[1]; + const int64_t n_vocab = (int64_t) model.vocab.n_tokens(); + + GGML_ASSERT(model.d2t->type == GGML_TYPE_I64); + GGML_ASSERT(model.d2t->ne[0] == n_draft_vocab); + + ggml_tensor * logits = ggml_fill(ctx0, ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, n_vocab, n_outputs), -INFINITY); + cur = ggml_set_rows(ctx0, logits, + ggml_reshape_3d(ctx0, cur, 1, n_draft_vocab, n_outputs), + ggml_reshape_3d(ctx0, model.d2t, n_draft_vocab, 1, 1)); + cur = ggml_reshape_2d(ctx0, cur, n_vocab, n_outputs); + } + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} diff --git a/src/models/gemma4.cpp b/src/models/gemma4.cpp index 6f7fcd645cbd..6a96979cebde 100644 --- a/src/models/gemma4.cpp +++ b/src/models/gemma4.cpp @@ -210,6 +210,8 @@ llama_model_gemma4::graph::graph(const llama_model & model, const llm_graph_para const float freq_scale_l = model.get_rope_freq_scale(cparams, il); const int n_rot_l = hparams.n_rot(il); + res->t_layer_inp[il] = inpL; + // norm cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); diff --git a/src/models/glm-dsa.cpp b/src/models/glm-dsa.cpp index 11d91312defc..32fe6def6f3c 100644 --- a/src/models/glm-dsa.cpp +++ b/src/models/glm-dsa.cpp @@ -101,11 +101,11 @@ void llama_model_glm_dsa::load_arch_tensors(llama_model_loader &) { layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, flags); // DSA indexer - layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags); - layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags); - layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags); - layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags); - layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags); + layer.indexer_k_norm = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "weight", i), {hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED); + layer.indexer_k_norm_b = create_tensor(tn(LLM_TENSOR_INDEXER_K_NORM, "bias", i), {hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED); + layer.indexer_proj = create_tensor(tn(LLM_TENSOR_INDEXER_PROJ, "weight", i), {n_embd, hparams.indexer_n_head}, flags | TENSOR_NOT_REQUIRED); + layer.indexer_attn_k = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_K, "weight", i), {n_embd, hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED); + layer.indexer_attn_q_b = create_tensor(tn(LLM_TENSOR_INDEXER_ATTN_Q_B, "weight", i), {q_lora_rank, hparams.indexer_n_head * hparams.indexer_head_size}, flags | TENSOR_NOT_REQUIRED); if (i < (int) hparams.n_layer_dense_lead) { layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, flags); layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, flags); diff --git a/src/models/lfm2.cpp b/src/models/lfm2.cpp index 97da8a6abb84..70e837d6eb2f 100644 --- a/src/models/lfm2.cpp +++ b/src/models/lfm2.cpp @@ -13,6 +13,7 @@ void llama_model_lfm2::load_arch_hparams(llama_model_loader & ml) { hparams.n_layer_dense_lead = hparams.n_layer(); switch (hparams.n_ff()) { + case 2560: type = LLM_TYPE_230M; break; case 4608: type = LLM_TYPE_350M; break; case 6912: type = LLM_TYPE_700M; break; case 8192: type = LLM_TYPE_1_2B; break; @@ -190,7 +191,15 @@ llama_model_lfm2::graph::graph(const llama_model & model, const llm_graph_ auto * conv_rs = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs); auto * conv = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs); - bx = ggml_concat(ctx0, conv, bx, 0); + // causal prepends the state, non-causal pads symmetrically for a centered window + if (hparams.causal_attn) { + bx = ggml_concat(ctx0, conv, bx, 0); + } else { + const int64_t pad = (hparams.n_shortconv_l_cache - 1) / 2; + auto * left = ggml_cont(ctx0, + ggml_view_3d(ctx0, conv, pad, hparams.n_embd, n_seqs, conv->nb[1], conv->nb[2], (d_conv - pad) * conv->nb[0])); + bx = ggml_pad_ext(ctx0, ggml_concat(ctx0, left, bx, 0), 0, pad, 0, 0, 0, 0, 0, 0); + } GGML_ASSERT(bx->ne[0] > conv->ne[0]); // last d_conv columns is a new conv state @@ -266,10 +275,12 @@ llama_model_lfm2::graph::graph(const llama_model & model, const llm_graph_ cb(cur, "result_norm", -1); res->t_embd = cur; - cur = build_lora_mm(model.output, cur, model.output_s); - cb(cur, "result_output", -1); + if (!cparams.embeddings) { + cur = build_lora_mm(model.output, cur, model.output_s); + cb(cur, "result_output", -1); - res->t_logits = cur; + res->t_logits = cur; + } ggml_build_forward_expand(gf, cur); } diff --git a/src/models/llama.cpp b/src/models/llama.cpp index c0ec7e0a9adb..4bfebc8843c6 100644 --- a/src/models/llama.cpp +++ b/src/models/llama.cpp @@ -124,6 +124,8 @@ llama_model_llama::graph::graph(const llama_model & model, const llm_grap ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + ggml_tensor * inpSA = inpL; // norm diff --git a/src/models/mamba-base.cpp b/src/models/mamba-base.cpp index c37f29c487ed..fd3fe3f03230 100644 --- a/src/models/mamba-base.cpp +++ b/src/models/mamba-base.cpp @@ -169,7 +169,6 @@ ggml_tensor * llm_build_mamba_base::build_mamba2_layer(llm_graph_input_rs * inp, GGML_ASSERT(ubatch.equal_seqs()); GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs); GGML_ASSERT(d_inner % n_head == 0); - GGML_ASSERT(d_inner % d_state == 0); GGML_ASSERT(d_inner % n_group == 0); ggml_tensor * conv_states_all = mctx_cur->get_r_l(il); diff --git a/src/models/mamba2.cpp b/src/models/mamba2.cpp index c5951cf0f7fc..d5c167cf0560 100644 --- a/src/models/mamba2.cpp +++ b/src/models/mamba2.cpp @@ -39,10 +39,11 @@ void llama_model_mamba2::load_arch_tensors(llama_model_loader &) { const int64_t d_inner = hparams.ssm_d_inner; const int64_t d_state = hparams.ssm_d_state; const int64_t n_group = hparams.ssm_n_group; - const int64_t d_in_proj = 2*d_inner + 2*n_group*d_state + n_head; + const int64_t dt_rank = hparams.ssm_dt_rank; + + const int64_t conv_dim = d_inner + 2 * n_group * d_state; + const int64_t d_in_proj = d_inner + conv_dim + dt_rank; - // only an expansion factor of 2 is supported for now - GGML_ASSERT(2 * n_embd == d_inner); tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -68,11 +69,11 @@ void llama_model_mamba2::load_arch_tensors(llama_model_loader &) { layer.ssm_conv1d = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner + 2*n_group*d_state}, 0); layer.ssm_conv1d_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner + 2*n_group*d_state}, 0); - layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {n_head}, 0); + layer.ssm_dt_b = create_tensor(tn(LLM_TENSOR_SSM_DT, "bias", i), {dt_rank}, 0); // no "weight" suffix for these - layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, n_head}, 0); - layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, n_head}, 0); + layer.ssm_a = create_tensor(tn(LLM_TENSOR_SSM_A, i), {1, dt_rank}, 0); + layer.ssm_d = create_tensor(tn(LLM_TENSOR_SSM_D, i), {1, dt_rank}, 0); layer.ssm_norm = create_tensor(tn(LLM_TENSOR_SSM_NORM, "weight", i), {d_inner / n_group, n_group}, 0); diff --git a/src/models/models.h b/src/models/models.h index 12f64c20ea5a..2ac8415a3639 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -937,6 +937,23 @@ struct llama_model_cohere2 : public llama_model_base { }; +struct llama_model_cohere2moe : public llama_model_base { + llama_model_cohere2moe(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + }; + + struct graph_mtp : public llm_graph_context { + graph_mtp(const llama_model & model, const llm_graph_params & params); + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + + struct llama_model_dbrx : public llama_model_base { llama_model_dbrx(const struct llama_model_params & params) : llama_model_base(params) {} void load_arch_hparams(llama_model_loader & ml) override; @@ -1089,6 +1106,21 @@ struct llama_model_glm_dsa : public llama_model_base { std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; +struct llama_model_eagle3 : public llama_model_base { + llama_model_eagle3(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + template + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + + ggml_tensor * build_inp_embd_enc() const; + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; + struct llama_model_mistral4 : public llama_model_deepseek2 { llama_model_mistral4(const struct llama_model_params & params) : llama_model_deepseek2(params) {} diff --git a/src/models/openai-moe.cpp b/src/models/openai-moe.cpp index 3ab15d61f08c..6d74f9c7e6ef 100644 --- a/src/models/openai-moe.cpp +++ b/src/models/openai-moe.cpp @@ -75,6 +75,8 @@ llama_model_openai_moe::graph::graph(const llama_model & model, const llm_graph_ ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + const float freq_base_l = model.get_rope_freq_base (cparams, il); const float freq_scale_l = model.get_rope_freq_scale(cparams, il); diff --git a/src/models/qwen3.cpp b/src/models/qwen3.cpp index 1d0d2fab362a..f4b2a2aebe0f 100644 --- a/src/models/qwen3.cpp +++ b/src/models/qwen3.cpp @@ -69,6 +69,8 @@ llama_model_qwen3::graph::graph(const llama_model & model, const llm_graph_param ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + ggml_tensor * inpSA = inpL; // norm diff --git a/src/models/qwen35.cpp b/src/models/qwen35.cpp index 4b642cff467c..d8ffe43ae76c 100644 --- a/src/models/qwen35.cpp +++ b/src/models/qwen35.cpp @@ -156,6 +156,8 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); @@ -173,7 +175,7 @@ llama_model_qwen35::graph::graph(const llama_model & model, const llm_graph_para } if (il == n_layer - 1 && inp_out_ids && cparams.embeddings_nextn_masked) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } diff --git a/src/models/qwen35moe.cpp b/src/models/qwen35moe.cpp index eb5e9a406a15..7b0876cbb04b 100644 --- a/src/models/qwen35moe.cpp +++ b/src/models/qwen35moe.cpp @@ -179,6 +179,8 @@ llama_model_qwen35moe::graph::graph(const llama_model & model, const llm_graph_p // MTP/NextN layers are loaded as extra decoder blocks but not executed in the main pass. for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + ggml_tensor * inpSA = inpL; cur = build_norm(inpL, model.layers[il].attn_norm, nullptr, LLM_NORM_RMS, il); diff --git a/src/models/qwen3moe.cpp b/src/models/qwen3moe.cpp index 317e668bec79..6f6df5390e33 100644 --- a/src/models/qwen3moe.cpp +++ b/src/models/qwen3moe.cpp @@ -78,6 +78,8 @@ llama_model_qwen3moe::graph::graph(const llama_model & model, const llm_graph_pa ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { + res->t_layer_inp[il] = inpL; + ggml_tensor * inpSA = inpL; // norm diff --git a/src/models/step35.cpp b/src/models/step35.cpp index e2218c587044..9b7b18a3678b 100644 --- a/src/models/step35.cpp +++ b/src/models/step35.cpp @@ -112,7 +112,7 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) { layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED); }; - auto load_block_mtp = [&](int i, bool is_first_mtp) { + auto load_block_mtp = [&](int i) { auto & layer = layers[i]; const uint32_t n_head_l = hparams.n_head(i); @@ -121,15 +121,12 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) { // The MTP block is a full Step3p5 decoder layer (mtp_block) plus the // NextN-specific wiring (enorm/hnorm/eh_proj + optional shared head). - // `mtp_flags` becomes NOT_REQUIRED when the GGUF is trunk-only. - // - // Only the FIRST MTP block (i == n_main) is required for the - // single-block MTP runtime; trailing MTP blocks are always tolerated - // as missing so pruned GGUFs (block 0 only) load cleanly. Override - // mtp_flags to NOT_REQUIRED for those. - const int eff_mtp_flags = is_first_mtp ? mtp_flags : (mtp_flags | TENSOR_NOT_REQUIRED); - - layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, eff_mtp_flags); + // Multi-block MTP: every declared MTP block is required (the draft chain + // runs all n_layer_nextn heads), so each block uses the captured + // `mtp_flags` directly — already NOT_REQUIRED for a trunk-only GGUF, + // which keeps that path correct. + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, mtp_flags); layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED); layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, TENSOR_NOT_REQUIRED); @@ -140,12 +137,12 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) { layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot_max/2}, TENSOR_NOT_REQUIRED | TENSOR_DUPLICATED); } - create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_l, n_embd_k_gqa, n_embd_v_gqa, eff_mtp_flags); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, eff_mtp_flags); + create_tensor_qkv(layer, i, n_embd, n_embd_head_k * n_head_l, n_embd_k_gqa, n_embd_v_gqa, mtp_flags); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_v * n_head_l, n_embd}, mtp_flags); layer.wqkv_gate = create_tensor(tn(LLM_TENSOR_ATTN_GATE, "weight", i), {n_embd, n_head_l}, TENSOR_NOT_REQUIRED); - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, eff_mtp_flags); + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, mtp_flags); // dense MLP (leading dense blocks) — present if the MTP block isn't MoE layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, TENSOR_NOT_REQUIRED); @@ -165,9 +162,9 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) { layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), {hparams.n_ff_shexp, n_embd}, TENSOR_NOT_REQUIRED); // NextN-specific tensors that define the MTP block. - layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, eff_mtp_flags); - layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, eff_mtp_flags); - layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, eff_mtp_flags); + layer.nextn.eh_proj = create_tensor(tn(LLM_TENSOR_NEXTN_EH_PROJ, "weight", i), { 2 * n_embd, n_embd }, mtp_flags); + layer.nextn.enorm = create_tensor(tn(LLM_TENSOR_NEXTN_ENORM, "weight", i), { n_embd }, mtp_flags); + layer.nextn.hnorm = create_tensor(tn(LLM_TENSOR_NEXTN_HNORM, "weight", i), { n_embd }, mtp_flags); layer.nextn.embed_tokens = create_tensor(tn(LLM_TENSOR_NEXTN_EMBED_TOKENS, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); layer.nextn.shared_head_head = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_HEAD, "weight", i), { n_embd, n_vocab }, TENSOR_NOT_REQUIRED); layer.nextn.shared_head_norm = create_tensor(tn(LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "weight", i), { n_embd }, TENSOR_NOT_REQUIRED); @@ -176,13 +173,11 @@ void llama_model_step35::load_arch_tensors(llama_model_loader & ml) { for (int i = 0; i < n_layer; ++i) { load_block_trunk(i, trunk_flags); } - // Only the first MTP block (i == n_main) is required at runtime — the - // single-block-MTP graph in build_arch_graph always uses that one. - // Trailing MTP blocks are loaded if present (so an un-pruned GGUF with - // all MTP layers still works) but tolerated when absent via the pruning - // path. See scripts/prune_step35_extra_mtp.py for the pruner. + // All n_layer_nextn MTP blocks are required — the multi-block draft chain + // runs every head (head k at offset k). The GGUF declares the count via + // step35.nextn_predict_layers. for (int i = n_layer; i < n_layer_all; ++i) { - load_block_mtp(i, /*is_first_mtp=*/ i == n_layer); + load_block_mtp(i); } } @@ -372,13 +367,14 @@ llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr : llm_graph_context(params) { GGML_ASSERT(hparams.n_layer_nextn > 0 && "STEP35 MTP requires n_layer_nextn > 0"); - // Single-block MTP only: always run the first trained MTP block (Qwen - // MTP / vLLM single-MTP-layer style). Multi-block round-robin proved to - // be a much deeper refactor than this PR justifies; the trailing MTP - // blocks are loaded with TENSOR_NOT_REQUIRED so pruned GGUFs (with just - // block 0) also work — see load_arch_tensors below and - // scripts/prune_step35_extra_mtp.py. - const int il = hparams.n_layer(); + // Multi-block MTP: the DECODER_MTP graph runs the MTP head selected by + // cparams.nextn_layer_offset (0 = first trained head). The speculative driver + // bumps the offset per draft step to chain heads 45->46->47. offset 0 keeps + // single-block behavior identical to before. + const int il = hparams.n_layer() + cparams.nextn_layer_offset; + GGML_ASSERT(cparams.nextn_layer_offset >= 0 && + cparams.nextn_layer_offset < (int) hparams.n_layer_nextn && + "nextn_layer_offset out of range [0, n_layer_nextn)"); const auto & layer = model.layers[il]; GGML_ASSERT(layer.nextn.eh_proj && "MTP block missing nextn.eh_proj"); @@ -536,6 +532,9 @@ llama_model_step35::graph_mtp::graph_mtp(const llama_model & model, const llm_gr cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "mtp_post_ffn", il); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + // Pre-norm hidden state: used by the AR draft loop to seed the next MTP step. cb(cur, "h_nextn", -1); res->t_h_nextn = cur; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e284a58d1c67..24592a27924b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -199,7 +199,6 @@ llama_build_and_test(test-jinja.cpp) llama_test(test-jinja NAME test-jinja-py ARGS -py LABEL python) llama_build_and_test(test-chat-auto-parser.cpp WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}) llama_build_and_test(test-chat-template.cpp) -llama_build_and_test(test-json-partial.cpp) llama_build_and_test(test-log.cpp) llama_build_and_test( test-peg-parser.cpp @@ -303,9 +302,9 @@ target_link_libraries(${TEST_TARGET} PRIVATE llama) llama_build_and_test(test-alloc.cpp) target_include_directories(test-alloc PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) -llama_build(export-graph-ops.cpp) -target_include_directories(export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) +llama_build(test-export-graph-ops.cpp) +target_include_directories(test-export-graph-ops PRIVATE ${PROJECT_SOURCE_DIR}/ggml/src) if (TARGET gguf-model-data) - target_link_libraries(export-graph-ops PRIVATE gguf-model-data) - target_compile_definitions(export-graph-ops PRIVATE LLAMA_HF_FETCH) + target_link_libraries(test-export-graph-ops PRIVATE gguf-model-data) + target_compile_definitions(test-export-graph-ops PRIVATE LLAMA_HF_FETCH) endif() diff --git a/tests/peg-parser/test-gbnf-generation.cpp b/tests/peg-parser/test-gbnf-generation.cpp index fe4bbbdd161f..60066a817b50 100644 --- a/tests/peg-parser/test-gbnf-generation.cpp +++ b/tests/peg-parser/test-gbnf-generation.cpp @@ -129,7 +129,154 @@ void test_gbnf_generation(testing &t) { }); assert_gbnf_equal(t, R"""( - root ::= ([^<] | "<" [^/] | "])* + root ::= until-0 + space ::= | " " | "\n"{1,2} [ \t]{0,20} + until-0 ::= | [<] until-0-01 | [^<] until-0 + until-0-01 ::= | [<] until-0-01 | [/] until-0-02 | [^/<] until-0 + until-0-02 ::= | [<] until-0-01 | [t] until-0-03 | [^] until-0 + )""", gbnf); + }); + + t.test("until grammar overlapping delimiter", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.until("\n\n"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= until-0 + space ::= | " " | "\n"{1,2} [ \t]{0,20} + until-0 ::= | [\n] until-0-01 | [^\n] until-0 + until-0-01 ::= | [\n] until-0-01 | [<] until-0-02 | [^\n<] until-0 + until-0-02 ::= | [\n] until-0-01 | [/] until-0-03 | [^\n/] until-0 + until-0-03 ::= | [\n] until-0-01 | [p] until-0-04 | [^\np] until-0 + until-0-04 ::= | [\n] until-0-01 | [a] until-0-05 | [^\na] until-0 + until-0-05 ::= | [\n] until-0-01 | [r] until-0-06 | [^\nr] until-0 + until-0-06 ::= | [\n] until-0-01 | [a] until-0-07 | [^\na] until-0 + until-0-07 ::= | [\n] until-0-01 | [m] until-0-08 | [^\nm] until-0 + until-0-08 ::= | [\n] until-0-01 | [e] until-0-09 | [^\ne] until-0 + until-0-09 ::= | [\n] until-0-01 | [t] until-0-10 | [^\nt] until-0 + until-0-10 ::= | [\n] until-0-01 | [e] until-0-11 | [^\ne] until-0 + until-0-11 ::= | [\n] until-0-01 | [r] until-0-12 | [^\nr] until-0 + until-0-12 ::= | [\n] until-0-01 | [>] until-0-13 | [^\n>] until-0 + until-0-13 ::= | [^\n] until-0 + )""", gbnf); + }); + + // DeepSeek-V3.2 tag prefix. The DSML token (|DSML|) embeds U+FF5C, + // so the delimiter mixes ASCII and multi-byte codepoints. + t.test("until grammar unicode delimiter", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.until("<|DSML|"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + root ::= until-0 + space ::= | " " | "\n"{1,2} [ \t]{0,20} + until-0 ::= | [<] until-0-01 | [^<] until-0 + until-0-01 ::= | [<] until-0-01 | [\uFF5C] until-0-02 | [^<\uFF5C] until-0 + until-0-02 ::= | [<] until-0-01 | [D] until-0-03 | [^") + p.literal("
"), ""); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + ac-3 ::= [<] ac-3-01 | [^<] ac-3 + ac-3-01 ::= [<] ac-3-01 | [/] ac-3-02 | [^/<] ac-3 + ac-3-02 ::= [<] ac-3-01 | [t] ac-3-03 | [^] | [<] ac-3-01 | [^<>] ac-3 + root ::= ac-3 + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("ac grammar terminates at first delimiter", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.ac(p.until("\n\n") + p.literal("\n\n"), "\n\n"); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + ac-3 ::= [\n] ac-3-01 | [^\n] ac-3 + ac-3-01 ::= [\n] ac-3-01 | [<] ac-3-02 | [^\n<] ac-3 + ac-3-02 ::= [\n] ac-3-01 | [/] ac-3-03 | [^\n/] ac-3 + ac-3-03 ::= [\n] ac-3-01 | [p] ac-3-04 | [^\np] ac-3 + ac-3-04 ::= [\n] ac-3-01 | [a] ac-3-05 | [^\na] ac-3 + ac-3-05 ::= [\n] ac-3-01 | [r] ac-3-06 | [^\nr] ac-3 + ac-3-06 ::= [\n] ac-3-01 | [a] ac-3-07 | [^\na] ac-3 + ac-3-07 ::= [\n] ac-3-01 | [m] ac-3-08 | [^\nm] ac-3 + ac-3-08 ::= [\n] ac-3-01 | [e] ac-3-09 | [^\ne] ac-3 + ac-3-09 ::= [\n] ac-3-01 | [t] ac-3-10 | [^\nt] ac-3 + ac-3-10 ::= [\n] ac-3-01 | [e] ac-3-11 | [^\ne] ac-3 + ac-3-11 ::= [\n] ac-3-01 | [r] ac-3-12 | [^\nr] ac-3 + ac-3-12 ::= [\n] ac-3-01 | [>] ac-3-13 | [^\n>] ac-3 + ac-3-13 ::= [\n] | [^\n] ac-3 + root ::= ac-3 + space ::= | " " | "\n"{1,2} [ \t]{0,20} + )""", gbnf); + }); + + t.test("ac grammar multiple delimiters", [](testing &t) { + auto parser = build_peg_parser([](common_peg_parser_builder & p) { + return p.ac(p.eps(), std::vector{"ab", "cd", "ef"}); + }); + + auto gbnf = build_grammar([&](const common_grammar_builder & builder) { + parser.build_grammar(builder); + }); + + assert_gbnf_equal(t, R"""( + ac-1 ::= [a] ac-1-01 | [c] ac-1-03 | [e] ac-1-05 | [^ace] ac-1 + ac-1-01 ::= [b] | [a] ac-1-01 | [c] ac-1-03 | [e] ac-1-05 | [^abce] ac-1 + ac-1-03 ::= [d] | [a] ac-1-01 | [c] ac-1-03 | [e] ac-1-05 | [^acde] ac-1 + ac-1-05 ::= [f] | [a] ac-1-01 | [c] ac-1-03 | [e] ac-1-05 | [^acef] ac-1 + root ::= ac-1 space ::= | " " | "\n"{1,2} [ \t]{0,20} )""", gbnf); }); diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 0dd8422e7366..e83ee85dd4ba 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -10,7 +10,7 @@ #undef NDEBUG #include -int main(void) { +static void test(void) { common_params params; printf("test-arg-parser: make sure there is no duplicated arguments in any examples\n\n"); @@ -210,3 +210,13 @@ int main(void) { printf("test-arg-parser: all tests OK\n\n"); } + +int main(void) { + try { + test(); + } catch (std::exception & e) { + fprintf(stderr, "test-arg-parser: exception: %s\n", e.what()); + return 1; + } + return 0; +} diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 8705da20b1dc..15b50209c850 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -130,12 +130,12 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m } } ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); - } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { + } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16) { // This is going to create some weird integers though. - ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + ggml_backend_tensor_set(tensor, data.data(), 0, nels * ggml_type_size(tensor->type)); } else if (tensor->type == GGML_TYPE_I64) { // Integers with a size of 8 bytes can be set by mirroring the float data, the specific values are again not really meaningful. - const size_t nbytes_half = ggml_nbytes(tensor)/2; + const size_t nbytes_half = nels * sizeof(float); ggml_backend_tensor_set(tensor, data.data(), 0*nbytes_half, nbytes_half); ggml_backend_tensor_set(tensor, data.data(), 1*nbytes_half, nbytes_half); } else { @@ -2890,12 +2890,17 @@ struct test_cpy : public test_case { const std::array ne_dst; const std::array permute_src; const std::array permute_dst; + const std::array dst_alloc; // if set, dst is a view into a larger buffer (strided) bool _src_use_permute; bool _dst_use_permute; bool _src_transpose; bool _use_dst_shape; + bool _use_dst_alloc; std::string vars() override { + if (_use_dst_alloc) { + return VARS_TO_STR8(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose, dst_alloc); + } if (_use_dst_shape) { return VARS_TO_STR7(type_src, type_dst, ne_src, ne_dst, permute_src, permute_dst, _src_transpose); } @@ -2943,12 +2948,15 @@ struct test_cpy : public test_case { std::array ne_dst = {-1, -1, -1, -1}, std::array permute_src = {0, 0, 0, 0}, std::array permute_dst = {0, 0, 0, 0}, - bool transpose_src = false) + bool transpose_src = false, + std::array dst_alloc = {0, 0, 0, 0}) : type_src(type_src), type_dst(type_dst), ne_src(ne_src), ne_dst(ne_dst), permute_src(permute_src), permute_dst(permute_dst), + dst_alloc(dst_alloc), _src_use_permute(permute_src[0] + permute_src[1] + permute_src[2] + permute_src[3] > 0), _dst_use_permute(permute_dst[0] + permute_dst[1] + permute_dst[2] + permute_dst[3] > 0), _src_transpose(transpose_src), - _use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0){} + _use_dst_shape(ne_dst[0] >= 0 && ne_dst[1] >= 0 && ne_dst[2] >= 0 && ne_dst[3] >= 0), + _use_dst_alloc(dst_alloc[0] > 0){} ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * src = ggml_new_tensor(ctx, type_src, 4, ne_src.data()); @@ -2966,12 +2974,23 @@ struct test_cpy : public test_case { } std::array dst_ne = _use_dst_shape ? ne_dst : std::array{src->ne[0], src->ne[1], src->ne[2], src->ne[3]}; - ggml_tensor * dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data()); - ggml_set_name(dst, "dst"); + ggml_tensor * dst; - if (_dst_use_permute) { - dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]); - ggml_set_name(dst, "dst_permuted"); + if (_use_dst_alloc) { + // view a sub-block of a larger buffer -> strided dst + ggml_tensor * dst_buf = ggml_new_tensor(ctx, type_dst, 4, dst_alloc.data()); + ggml_set_name(dst_buf, "dst_buf"); + dst = ggml_view_4d(ctx, dst_buf, dst_ne[0], dst_ne[1], dst_ne[2], dst_ne[3], + dst_buf->nb[1], dst_buf->nb[2], dst_buf->nb[3], 0); + ggml_set_name(dst, "dst_view"); + } else { + dst = ggml_new_tensor(ctx, type_dst, 4, dst_ne.data()); + ggml_set_name(dst, "dst"); + + if (_dst_use_permute) { + dst = ggml_permute(ctx, dst, permute_dst[0], permute_dst[1], permute_dst[2], permute_dst[3]); + ggml_set_name(dst, "dst_permuted"); + } } ggml_tensor * out = ggml_cpy(ctx, src, dst); @@ -3298,21 +3317,29 @@ struct test_norm : public test_case { const std::array ne; const bool v; // whether a is a non-contiguous view const float eps; + const bool noncontig_rows; std::string vars() override { - return VARS_TO_STR4(type, ne, v, eps); + return VARS_TO_STR5(type, ne, v, eps, noncontig_rows); } test_norm(ggml_type type = GGML_TYPE_F32, std::array ne = {64, 5, 4, 3}, bool v = false, - float eps = 1e-6f) - : type(type), ne(ne), v(v), eps(eps) {} + float eps = 1e-6f, + bool noncontig_rows = false) + : type(type), ne(ne), v(v), eps(eps), noncontig_rows(noncontig_rows) {} ggml_tensor * build_graph(ggml_context * ctx) override { - ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + const std::array ne_a = noncontig_rows ? + std::array{ ne[1], ne[0], ne[2], ne[3] } : ne; + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); ggml_set_name(a, "a"); + if (noncontig_rows) { + a = ggml_permute(ctx, a, 1, 0, 2, 3); + ggml_set_name(a, "permuted a"); + } if (v) { a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0); ggml_set_name(a, "view of a"); @@ -6193,21 +6220,29 @@ struct test_l2_norm : public test_case { const std::array ne; const float eps; bool v; + bool noncontig_rows; std::string vars() override { - return VARS_TO_STR4(type, ne, eps, v); + return VARS_TO_STR5(type, ne, eps, v, noncontig_rows); } test_l2_norm(ggml_type type = GGML_TYPE_F32, std::array ne = {64, 64, 320, 1}, float eps = 1e-12f, - bool v = false) - : type(type), ne(ne), eps(eps), v(v) {} + bool v = false, + bool noncontig_rows = false) + : type(type), ne(ne), eps(eps), v(v), noncontig_rows(noncontig_rows) {} ggml_tensor * build_graph(ggml_context * ctx) override { - ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); + const std::array ne_a = noncontig_rows ? + std::array{ ne[1], ne[0], ne[2], ne[3] } : ne; + ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne_a.data()); ggml_set_name(a, "a"); + if (noncontig_rows) { + a = ggml_permute(ctx, a, 1, 0, 2, 3); + ggml_set_name(a, "permuted a"); + } if (v) { a = ggml_view_4d(ctx, a, a->ne[0]/2, a->ne[1]/2, a->ne[2]/2, a->ne[3]/2, a->nb[1], a->nb[2], a->nb[3], 0); ggml_set_name(a, "view of a"); @@ -7957,6 +7992,9 @@ static std::vector> make_test_cases_eval() { } } } + for (auto kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) { + test_cases.emplace_back(new test_conv_2d({ 256, 256, 192, 1 }, { 3, 3, 192, 96 }, kernel_type, 1, 1, 1, 1, 1, 1, false)); + } // sycl backend will limit task global_range < MAX_INT // test cases for 2D im2col with large input W and H (occurs in stable-diffusion) @@ -8086,6 +8124,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 2})); test_cases.emplace_back(new test_repeat(GGML_TYPE_I32, {10, 5, 4, ne3}, {2, 1, 1, 1})); test_cases.emplace_back(new test_repeat(GGML_TYPE_I16, {10, 5, 4, ne3}, {1, 1, 1, 2})); + test_cases.emplace_back(new test_repeat(GGML_TYPE_BF16, {10, 5, 4, ne3}, {2, 1, 1, 1})); } for (bool view : {false, true}) { @@ -8159,6 +8198,10 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 4, 1, 1}, {-1,-1,-1,-1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true)); test_cases.emplace_back(new test_cpy(GGML_TYPE_I32, GGML_TYPE_I32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0})); test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {-1,-1,-1,-1}, {1, 2, 0, 3}, {0, 0, 0, 0})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2097121, 1, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {2, 2, 524281, 1}, {-1,-1,-1,-1}, {1, 0, 2, 3})); + test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {128, 2, 3, 1}, {128, 2, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, false, {128, 4, 3, 1})); // strided dst + test_cases.emplace_back(new test_cpy(GGML_TYPE_F16, GGML_TYPE_F16, {128, 2, 3, 1}, {128, 2, 3, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, false, {128, 4, 3, 1})); // strided dst // CPY - different src/dst shapes (reshaping via CPY) // Use permutations of {3, 5, 7, 32}. Total elements: 3*5*7*32 = 3360. @@ -8281,9 +8324,11 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps)); test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, v, eps)); } + test_cases.emplace_back(new test_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, false, eps, true)); test_cases.emplace_back(new test_rms_norm_back(GGML_TYPE_F32, { n, 5, 4, 3 }, eps)); test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false)); test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, true)); + test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, { n, 5, 4, 3 }, eps, false, true)); } } @@ -8401,6 +8446,11 @@ static std::vector> make_test_cases_eval() { } } + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q4_0, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_Q8_0, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_MXFP4, GGML_TYPE_F32, 2880, 32, 2880, {1, 1}, {1, 1})); + + #if 0 { // Test paths in OpenCL @@ -8432,6 +8482,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 2}, {2, 1})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 2}, {1, 2})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {3, 2}, {2, 2})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 4, k, {3, 2}, {2, 2})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {1, 1}, {1, 1})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, k, {1, 1}, {2, 1})); @@ -8448,6 +8499,7 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {2, 3}, {1, 1}, {0, 1, 3, 2})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, k, {2, 3}, {1, 1}, {0, 3, 2, 1})); + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 4, k, {2, 3}, {1, 1}, {0, 3, 2, 1})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, k, {2, 3}, {1, 1}, {0, 2, 1, 3})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, k, {2, 3}, {1, 1}, {0, 1, 3, 2})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 8, k, {2, 3}, {1, 1}, {0, 3, 2, 1})); @@ -8573,6 +8625,7 @@ static std::vector> make_test_cases_eval() { // gpt-oss issue with Vulkan mmq_id test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_MXFP4, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880)); + test_cases.emplace_back(new test_mul_mat_id(GGML_TYPE_Q4_0, GGML_TYPE_F32, 32, 2, false, 2880, 32, 2880)); for (ggml_type type_a : all_types) { test_cases.emplace_back(new test_mul_mat_id(type_a, GGML_TYPE_F32, 4, 2, false, 64, 16, 3*ggml_blck_size(type_a))); @@ -8643,6 +8696,12 @@ static std::vector> make_test_cases_eval() { 256, 16, 16, {ne2, 1}, {1, 1})); } + // nr2 sweep to cover the cublasSgemmBatched pointer-array path (dps2 > 1) + for (int64_t nr2 : {8, 16, 32}) { + test_cases.emplace_back(new test_out_prod(GGML_TYPE_F32, GGML_TYPE_F32, + 256, 16, 16, {1, 1}, {nr2, 1})); + } + // add_id for (ggml_type type_a : {GGML_TYPE_F32}) { for (ggml_type type_b : {GGML_TYPE_F32}) { @@ -8849,7 +8908,12 @@ static std::vector> make_test_cases_eval() { for (int v : { 0, 1, 2, 3 }) { for (int dim : { 0, 1, 2, 3, }) { test_cases.emplace_back(new test_concat(GGML_TYPE_F32, {11, 12, 13, 14}, 7, dim, v)); + test_cases.emplace_back(new test_concat(GGML_TYPE_F16, {11, 12, 13, 14}, 7, dim, v)); + test_cases.emplace_back(new test_concat(GGML_TYPE_BF16, {11, 12, 13, 14}, 7, dim, v)); + test_cases.emplace_back(new test_concat(GGML_TYPE_I8, {11, 12, 13, 14}, 7, dim, v)); + test_cases.emplace_back(new test_concat(GGML_TYPE_I16, {11, 12, 13, 14}, 7, dim, v)); test_cases.emplace_back(new test_concat(GGML_TYPE_I32, {11, 12, 13, 14}, 7, dim, v)); + test_cases.emplace_back(new test_concat(GGML_TYPE_I64, {11, 12, 13, 14}, 7, dim, v)); } } @@ -9264,6 +9328,34 @@ static std::vector> make_test_cases_perf() { } } + struct conv3d_perf_case { + int N, IC, ID, IH, IW, OC, KD, KH, KW, s0, s1, s2, p0, p1, p2, d0, d1, d2; + }; + + const std::vector conv3d_cases = { + {1, 320, 8, 38, 26, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {1, 1280, 8, 38, 26, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {1, 320, 8, 76, 52, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {1, 1280, 8, 76, 52, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {1, 320, 8, 152, 104, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1}, +#if 0 + // too slow on some devices + {1, 1280, 8, 152, 104, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {1, 320, 4, 304, 208, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {1, 640, 4, 304, 208, 1280, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1}, +#endif + }; + + for (ggml_type kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) { + for (const conv3d_perf_case & c : conv3d_cases) { + test_cases.emplace_back(new test_conv_3d( + c.N, c.IC, c.ID, c.IH, c.IW, + c.OC, c.KD, c.KH, c.KW, + c.s0, c.s1, c.s2, c.p0, c.p1, c.p2, c.d0, c.d1, c.d2, + kernel_type)); + } + } + test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 1, 1, 1})); test_cases.emplace_back(new test_bin_bcast(ggml_add, GGML_TYPE_F32, {4096, 1, 1, 1}, {1, 512, 1, 1})); @@ -9872,7 +9964,7 @@ static void usage(char ** argv) { printf(" --output specifies output format (default: console, options: console, sql, csv)\n"); printf(" --list-ops lists all available GGML operations\n"); printf(" --show-coverage shows test coverage\n"); - printf(" --test-file reads test operators from a test file generated by llama-export-graph-ops\n"); + printf(" --test-file reads test operators from a test file generated by test-export-graph-ops\n"); printf(" -j runs tests using parallel worker threads (default: 1, test mode only)\n"); } diff --git a/tests/test-chat-auto-parser.cpp b/tests/test-chat-auto-parser.cpp index 6f8e957489c2..5cc1057532c3 100644 --- a/tests/test-chat-auto-parser.cpp +++ b/tests/test-chat-auto-parser.cpp @@ -1369,7 +1369,7 @@ static void test_nemotron_tool_format(testing & t) { // Check argument markers (note: markers retain trailing newlines for proper parsing) t.assert_equal("arg_name_prefix should be '\\n'", ">\n", analysis.tools.arguments.name_suffix); - t.assert_equal("arg_value_suffix should be '\\n'", "\n", analysis.tools.arguments.value_suffix); + t.assert_equal("arg_value_suffix should be '\\n\\n'", "\n\n", analysis.tools.arguments.value_suffix); // Check format classification t.assert_true("tool format should be TAG_WITH_TAGGED", analysis.tools.format.mode == tool_format::TAG_WITH_TAGGED); @@ -2030,12 +2030,11 @@ static void test_tagged_args_with_embedded_quotes(testing & t) { return p.content(p.until("")) + p.optional(tool_section) + p.end(); }); - // The exact input from the failing test std::string input = "\n" "\n" - "\n" - "foo.cpp\n" + "" + "foo.cpp" "\n" "" "def foo(arg = \"14\"):\n" diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index c388dee1c453..d971b23746c7 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -135,7 +135,7 @@ int main(int argc, char ** argv) { output_path = args[i + 1]; i++; } else if (args[i] == "--no-common") { - use_common = true; + use_common = false; } else if (tmpl_path.empty()) { tmpl_path = args[i]; } else { diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index c1be9eb5a99f..c38aed8cfeb5 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -1562,37 +1562,112 @@ static void test_msgs_oaicompat_json_conversion() { } } -static void test_split_by_role() { +static void test_msg_token_delimiters_split() { LOG_DBG("%s\n", __func__); + // Delimiters that share a leading token, distinguished by the second token, + // to exercise the per-position token matching. + const common_chat_msg_delimiters delims = { + { { COMMON_CHAT_ROLE_USER, "", { 10, 11 } }, + { COMMON_CHAT_ROLE_ASSISTANT, "", { 10, 12 } } } + }; + // Empty inputs - assert_equals(0, common_chat_split_by_role("", {}).size()); - assert_equals(0, common_chat_split_by_role("hello", {}).size()); - assert_equals(0, common_chat_split_by_role("", { { "user", "<|user|>" } }).size()); + assert_equals(0, common_chat_msg_delimiters{}.split({}).spans.size()); + assert_equals(0, common_chat_msg_delimiters{}.split({ 10, 11 }).spans.size()); + assert_equals(0, delims.split({}).spans.size()); + + // No delimiters match -> no spans + assert_equals(0, delims.split({ 100, 101, 102 }).spans.size()); + + // Multi-role conversation: HiHelloBye + { + const llama_tokens tokens = { + 10, 11, // + 100, 101, // Hi + 10, 12, // + 200, 201, 202, // Hello + 10, 11, // + 300, 301, // Bye + }; + + const auto result = delims.split(tokens); + const auto & spans = result.spans; + assert_equals(3, spans.size()); + + assert_equals(COMMON_CHAT_ROLE_USER, spans[0].role); + assert_equals(0, spans[0].pos); + assert_equals(4, spans[0].len); + + assert_equals(COMMON_CHAT_ROLE_ASSISTANT, spans[1].role); + assert_equals(4, spans[1].pos); + assert_equals(5, spans[1].len); + + assert_equals(COMMON_CHAT_ROLE_USER, spans[2].role); + assert_equals(9, spans[2].pos); + assert_equals(4, spans[2].len); + + // is_user_start() is true at the token position where a user span begins + assert_equals(true, result.is_user_start(0)); + assert_equals(false, result.is_user_start(4)); // assistant span + assert_equals(true, result.is_user_start(9)); + } + + // Content before the first delimiter is not captured as a span + { + const llama_tokens tokens = { + 500, 501, // leading content (dropped) + 10, 11, // + 100, // Hi + }; + + const auto spans = delims.split(tokens).spans; + assert_equals(1, spans.size()); + assert_equals(COMMON_CHAT_ROLE_USER, spans[0].role); + assert_equals(2, spans[0].pos); + assert_equals(3, spans[0].len); + } + + // Skipped regions (media chunks) are jumped over but still count as span content + { + const llama_tokens tokens = { + 10, 11, // + LLAMA_TOKEN_NULL, // media chunk (3 tokens) + LLAMA_TOKEN_NULL, + LLAMA_TOKEN_NULL, + 100, // Hi + 10, 12, // + }; + + const std::map skips = { { 2, 3 } }; - // Multi-role conversation, no leading/trailing content + const auto spans = delims.split(tokens, skips).spans; + assert_equals(2, spans.size()); + + assert_equals(COMMON_CHAT_ROLE_USER, spans[0].role); + assert_equals(0, spans[0].pos); + assert_equals(6, spans[0].len); + + assert_equals(COMMON_CHAT_ROLE_ASSISTANT, spans[1].role); + assert_equals(6, spans[1].pos); + assert_equals(2, spans[1].len); + } + + // A delimiter sequence inside a skipped region is not matched { - const std::string prompt = "<|user|>Hi<|assistant|>Hello<|user|>Bye"; - const auto splits = common_chat_split_by_role(prompt, { - { "user", "<|user|>" }, - { "assistant", "<|assistant|>" }, - }); - assert_equals(3, splits.size()); - - assert_equals("user", splits[0].role); - assert_equals(0, splits[0].pos); - assert_equals(10, splits[0].len); - assert_equals("<|user|>Hi", prompt.substr(splits[0].pos, splits[0].len)); - - assert_equals("assistant", splits[1].role); - assert_equals(10, splits[1].pos); - assert_equals(18, splits[1].len); - assert_equals("<|assistant|>Hello", prompt.substr(splits[1].pos, splits[1].len)); - - assert_equals("user", splits[2].role); - assert_equals(28, splits[2].pos); - assert_equals(11, splits[2].len); - assert_equals("<|user|>Bye", prompt.substr(splits[2].pos, splits[2].len)); + const llama_tokens tokens = { + 10, 11, // + 10, 12, // skipped region that happens to contain delimiter tokens + 100, // Hi + }; + + const std::map skips = { { 2, 2 } }; + + const auto spans = delims.split(tokens, skips).spans; + assert_equals(1, spans.size()); + assert_equals(COMMON_CHAT_ROLE_USER, spans[0].role); + assert_equals(0, spans[0].pos); + assert_equals(5, spans[0].len); } } @@ -1882,11 +1957,29 @@ static void test_lfm2_parser(const std::string & template_path, bool detailed_de .expect(simple_assist_msg("Use this format: [link text](url). Example: [Wikipedia](https://www.wikipedia.org).")) .run(); - // Python tool with multiline code in string + // Python tool with multiline code in string: the \n in the literal decodes to a real + // newline, emitted as a JSON \n escape (not a doubled backslash). tst.test("<|tool_call_start|>[python(code=\"def hello():\\n print('hey')\")]<|tool_call_end|>") .tools({ python_tool }) .expect_tool_calls({ - { "python", R"#({"code": "def hello():\\n print('hey')"})#", "" } + { "python", R"#({"code": "def hello():\n print('hey')"})#", "" } + }) + .run(); + + // String escape sequences decode to their actual characters (newline + tab here), + // so a "write a two line file" style call produces real line breaks, not literal "\n". + tst.test("<|tool_call_start|>[python(code=\"First line\\nSecond line\\tindented\")]<|tool_call_end|>") + .tools({ python_tool }) + .expect_tool_calls({ + { "python", R"#({"code": "First line\nSecond line\tindented"})#", "" } + }) + .run(); + + // Escaped quotes inside a string argument survive the round-trip. + tst.test("<|tool_call_start|>[python(code=\"print(\\\"hi\\\")\")]<|tool_call_end|>") + .tools({ python_tool }) + .expect_tool_calls({ + { "python", R"#({"code": "print(\"hi\")"})#", "" } }) .run(); @@ -1935,6 +2028,10 @@ static void test_template_output_peg_parsers(bool detailed_debug) { } })"; + const char * const_schema = R"({ + "const": "42" + })"; + { // Qwen3.5 (basically same as Nemotron, but keeping separate tests just in case) auto tst = peg_tester("models/templates/Qwen3.5-4B.jinja", detailed_debug); @@ -2020,6 +2117,80 @@ static void test_template_output_peg_parsers(bool detailed_debug) { }) .run(); + tst.test( + "\n" + "\n" + "\n" + "foo.c\n" + "\n" + "\n" + "#iclunde\n" + "\n" + "\n" + "#include\n" + "\n" + "\n" + "") + .enable_thinking(false) + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .tools({ + edit_tool + }) + .expect_tool_calls({ + { "edit", "{\"filename\": \"foo.c\", \"oldString\": \"#iclunde\", \"newString\": \"#include\"}", {} }, + }) + .run(); + + // a parameter value that itself ends in a newline (e.g. a source file with a + // trailing newline). The structural delimiter is "\n\n", so the value + // "#include\n" renders as "...#include\n\n\n". The trailing newline must + // be preserved faithfully (no stripping), and the generated grammar must admit a + // value ending on a delimiter prefix. Regression test for gbnf_excluding_pattern. + tst.test( + "\n" + "\n" + "\n" + "foo.c\n" + "\n" + "\n" + "#iclunde\n" + "\n" + "\n" + "#include\n" + "\n" + "\n" + "\n" + "") + .enable_thinking(false) + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .tools({ + edit_tool + }) + .expect_tool_calls({ + { "edit", "{\"filename\": \"foo.c\", \"oldString\": \"#iclunde\", \"newString\": \"#include\\n\"}", {} }, + }) + .run(); + + + // test code that starts with indent + tst.test( + "\n" + "\n" + "\n" + " print(\"Hello, world!\")\n" + "\n" + "\n" + "") + .enable_thinking(false) + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .tools({ + python_tool + }) + .expect_tool_calls({ + { "python", "{\"code\": \" print(\\\"Hello, world!\\\")\"}", {} }, + }) + .run(); + tst.test( "I need to output the invoice details in JSON\n" "\n\n" @@ -2644,6 +2815,100 @@ static void test_template_output_peg_parsers(bool detailed_debug) { .run(); } + { + // Cohere2 MoE (North Code) - dedicated parser. + // Marker-wrapped format: <|START_THINKING|>...<|END_THINKING|> then either + // <|START_TEXT|>...<|END_TEXT|> (content) or <|START_ACTION|>[json]<|END_ACTION|> (tools). + // The generation prompt forces a leading <|START_THINKING|>, so model output begins inside + // the thinking block: test inputs start with the reasoning body, not the <|START_THINKING|> tag. + auto tst = peg_tester("models/templates/Cohere2MoE.jinja", detailed_debug); + + // Content with reasoning, extracted. + tst.test("I'm\nthinking<|END_THINKING|><|START_TEXT|>Hello, world!\nWhat's up?<|END_TEXT|>") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .expect(message_assist_thoughts) + .run(); + + // Content with reasoning, reasoning_format=NONE -> thinking kept inline in content (markers preserved). + tst.test("I'm\nthinking<|END_THINKING|><|START_TEXT|>Hello, world!\nWhat's up?<|END_TEXT|>") + .expect(message_assist_thoughts_unparsed_r7b) + .run(); + + // Content with empty thinking block. + tst.test("<|END_THINKING|><|START_TEXT|>Hello, world!\nWhat's up?<|END_TEXT|>") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .expect(message_assist) + .run(); + + // Single tool call with reasoning. + tst.test( + "I'm\nthinking<|END_THINKING|>" + "<|START_ACTION|>[\n" + " {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n" + "]<|END_ACTION|>") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .tools({ special_function_tool }) + .expect(message_assist_thoughts_call_idx) + .run(); + + // Single tool call, empty thinking block (no reasoning content). + tst.test( + "<|END_THINKING|>" + "<|START_ACTION|>[\n" + " {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}}\n" + "]<|END_ACTION|>") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .tools({ special_function_tool }) + .expect(message_assist_call_idx) + .run(); + + // Tool call with an array argument (todo_list). + tst.test( + "<|END_THINKING|>" + "<|START_ACTION|>[\n" + " {\"tool_call_id\": \"0\", \"tool_name\": \"todo_list\", \"parameters\": {\"todos\": [\"buy milk\", \"walk dog\"]}}\n" + "]<|END_ACTION|>") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .tools({ todo_list }) + .expect(simple_assist_msg("", "", "todo_list", "{\"todos\": [\"buy milk\", \"walk dog\"]}", "0")) + .run(); + + // Parallel tool calls with reasoning. + tst.test( + "I'm\nthinking<|END_THINKING|>" + "<|START_ACTION|>[\n" + " {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", \"parameters\": {\"arg1\": 1}},\n" + " {\"tool_call_id\": \"1\", \"tool_name\": \"python\", \"parameters\": {\"code\": \"print('hey')\"}}\n" + "]<|END_ACTION|>") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .parallel_tool_calls(true) + .tools({ special_function_tool, python_tool }) + .expect_reasoning("I'm\nthinking") + .expect_tool_calls({ + { "special_function", R"({"arg1": 1})", "0" }, + { "python", "{\"code\": \"print('hey')\"}", "1" }, + }) + .run(); + + // Tools available but the model answers with content instead of calling a tool. + tst.test("I'm\nthinking<|END_THINKING|><|START_TEXT|>Hello, world!\nWhat's up?<|END_TEXT|>") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .tools({ special_function_tool }) + .expect(message_assist_thoughts) + .run(); + + // Partial tool call (streaming): name/id resolved before arguments arrive. + tst.test( + "I'm\nthinking<|END_THINKING|>" + "<|START_ACTION|>[\n" + " {\"tool_call_id\": \"0\", \"tool_name\": \"special_function\", ") + .reasoning_format(COMMON_REASONING_FORMAT_DEEPSEEK) + .tools({ special_function_tool }) + .is_partial(true) + .expect(message_assist_thoughts_partial_call) + .run(); + } + { // Google Gemma 2 2B - does not support tool calling auto tst = peg_tester("models/templates/google-gemma-2-2b-it.jinja"); @@ -3102,18 +3367,16 @@ static void test_template_output_peg_parsers(bool detailed_debug) { tst.test( "\n" "\n" - "\n" - "foo.cpp\n" + "" + "foo.cpp" "\n" "" "def foo(arg = \"14\"):\n" " return arg + \"bar\"\n" - "\n" "\n" "" "def foo(arg = \"15\"):\n" " pass\n" - "\n" "\n" "\n" "") @@ -4833,6 +5096,20 @@ static void test_template_output_peg_parsers(bool detailed_debug) { auto tst = peg_tester("models/templates/meta-llama-Llama-3.1-8B-Instruct.jinja", detailed_debug); tst.test("Hello, world!\nWhat's up?").tools({ special_function_tool }).expect(message_assist).expect_reconstruction().run(); + tst.test( + "```json\n\"42\"\n```") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .json_schema(const_schema) + .expect_content(R"("42")") + .run(); + + tst.test( + "\"42\"\n") + .reasoning_format(COMMON_REASONING_FORMAT_AUTO) + .json_schema(const_schema) + .expect_content(R"("42")") + .run(); + // Continuation tests tst.test("world!\nWhat's up?") .messages({ message_user, message_assist_prefill_content }) @@ -5655,7 +5932,7 @@ int main(int argc, char ** argv) { { test_msg_diffs_compute(); test_msgs_oaicompat_json_conversion(); - test_split_by_role(); + test_msg_token_delimiters_split(); test_tools_oaicompat_json_conversion(); test_convert_responses_to_chatcmpl(); test_developer_role_to_system_workaround(); diff --git a/tests/export-graph-ops.cpp b/tests/test-export-graph-ops.cpp similarity index 98% rename from tests/export-graph-ops.cpp rename to tests/test-export-graph-ops.cpp index 64cf6dcea353..7d8118dcd686 100644 --- a/tests/export-graph-ops.cpp +++ b/tests/test-export-graph-ops.cpp @@ -185,7 +185,7 @@ int main(int argc, char ** argv) { return 1; } #else - LOG_ERR("export-graph-ops compiled without HF fetch support\n"); + LOG_ERR("test-export-graph-ops compiled without HF fetch support\n"); return 1; #endif } diff --git a/tests/test-jinja.cpp b/tests/test-jinja.cpp index b5ee53461e8d..81bbcd55a46e 100644 --- a/tests/test-jinja.cpp +++ b/tests/test-jinja.cpp @@ -435,6 +435,24 @@ static void test_expressions(testing & t) { "('c', 'b', 'a')" ); + test_template(t, "string slice negative step", + "{{ 'abcdef'[::-2] }}", + json::object(), + "fdb" + ); + + test_template(t, "string slice negative start and step", + "{{ 'abcdef'[-1:1:-1] }}", + json::object(), + "fedc" + ); + + test_template(t, "string slice negative start, stop and step", + "{{ 'abcdef'[-1:-5:-1] }}", + json::object(), + "fedc" + ); + test_template(t, "arithmetic", "{{ (a + b) * c }}", {{"a", 2}, {"b", 3}, {"c", 4}}, @@ -583,8 +601,8 @@ static void test_filters(testing & t) { "hello jinja" ); - test_template(t, "length list", - "{{ items|length }}", + test_template(t, "length (count alias) list", + "{{ items|count }}", {{"items", json::array({1, 2, 3})}}, "3" ); @@ -693,8 +711,8 @@ static void test_filters(testing & t) { "fallback" ); - test_template(t, "default with falsy value", - "{{ ''|default('fallback', true) }}", + test_template(t, "default (d alias) with falsy value", + "{{ ''|d('fallback', true) }}", json::object(), "fallback" ); @@ -977,6 +995,32 @@ static void test_macros(testing & t) { json::object(), "Hello, John Smith,Hi, Jane Doe" ); + + test_template(t, "macro with caller", + "\ +{%- macro nest_dict(o, i, ff='') %}\n\ + {{- caller(ff) }}\n\ + {%- for k, v in o|items %}\n\ + {{- i + k + ': ' }}\n\ + {%- if v is mapping %}\n\ + {{- '{' }}\n\ + {% call(f) nest_dict(v, i + ' ') %}\n\ + {{- 'fail' if ff is undefined }}\n\ + {%- endcall %}\n\ + {{- i + '}' }}\n\ + {% else %}\n\ + {{- v|string }}\n\ + {% endif %}\n\ + {%- endfor %}\n\ +{%- endmacro %}\n\ +{%- call(f) nest_dict({'root1': 1, 'root2': {'nest1': 1, 'nest2': {'nest3': 2}}}, ' ', 'Dict') %}\n\ + {{- 'fail' if ff is defined }}\n\ + {{- f + ' {' }}\n\ +{% endcall %}\n\ +{{- '}' }}", + json::object(), + "Dict {\n root1: 1\n root2: {\n nest1: 1\n nest2: {\n nest3: 2\n }\n }\n}" + ); } static void test_namespace(testing & t) { @@ -1320,6 +1364,12 @@ static void test_string_methods(testing & t) { "hello jinja" ); + test_template(t, "string.replace() empty", + "{{ s.replace('', '.') }}", + {{"s", "hello world"}}, + ".h.e.l.l.o. .w.o.r.l.d." + ); + test_template(t, "string.replace() with count", "{{ s.replace('a', 'X', 2) }}", {{"s", "banana"}}, diff --git a/tests/test-json-partial.cpp b/tests/test-json-partial.cpp deleted file mode 100644 index 39da9276ef45..000000000000 --- a/tests/test-json-partial.cpp +++ /dev/null @@ -1,287 +0,0 @@ -#include "common.h" -#include "json-partial.h" -#include -#include -#include - -template static void assert_equals(const T & expected, const T & actual) { - if (expected != actual) { - std::cerr << "Expected: " << expected << std::endl; - std::cerr << "Actual: " << actual << std::endl; - std::cerr << std::flush; - throw std::runtime_error("Test failed"); - } -} - -static void test_json_healing() { - auto parse = [](const std::string & str) { - std::cerr << "# Parsing: " << str << '\n'; - std::string::const_iterator it = str.begin(); - const auto end = str.end(); - common_json out; - std::string healing_marker = "$llama.cpp.json$"; - if (common_json_parse(it, end, healing_marker, out)) { - auto dump = out.json.dump(); - std::cerr << "Parsed: " << dump << '\n'; - std::cerr << "Magic: " << out.healing_marker.json_dump_marker << '\n'; - std::string result; - if (!out.healing_marker.json_dump_marker.empty()) { - auto i = dump.find(out.healing_marker.json_dump_marker); - if (i == std::string::npos) { - throw std::runtime_error("Failed to find magic in dump " + dump + " (magic: " + out.healing_marker.json_dump_marker + ")"); - } - result = dump.substr(0, i); - } else { - result = dump; - } - std::cerr << "Result: " << result << '\n'; - if (string_starts_with(str, result)) { - std::cerr << "Failure!\n"; - } - // return dump; - } else { - throw std::runtime_error("Failed to parse: " + str); - } - - }; - auto parse_all = [&](const std::string & str) { - for (size_t i = 1; i < str.size(); i++) { - parse(str.substr(0, i)); - } - }; - parse_all("{\"a\": \"b\"}"); - parse_all("{\"hey\": 1, \"ho\\\"ha\": [1]}"); - - parse_all("[{\"a\": \"b\"}]"); - - auto test = [&](const std::vector & inputs, const std::string & expected, const std::string & expected_marker) { - for (const auto & input : inputs) { - common_json out; - assert_equals(true, common_json_parse(input, "$foo", out)); - assert_equals(expected, out.json.dump(/* indent */ -1, /* indent_char */ ' ', /* ensure_ascii */ true)); - assert_equals(expected_marker, out.healing_marker.json_dump_marker); - } - }; - // No healing needed: - test( - { - R"([{"a":"b"}, "y"])", - }, - R"([{"a":"b"},"y"])", - "" - ); - // Partial literals can't be healed: - test( - { - R"([1)", - R"([tru)", - R"([n)", - R"([nul)", - R"([23.2)", - }, - R"(["$foo"])", - R"("$foo)" - ); - test( - { - R"({"a": 1)", - R"({"a": tru)", - R"({"a": n)", - R"({"a": nul)", - R"({"a": 23.2)", - }, - R"({"a":"$foo"})", - R"("$foo)" - ); - test( - { - R"({)", - }, - R"({"$foo":1})", - R"("$foo)" - ); - test( - { - R"([)", - }, - R"(["$foo"])", - R"("$foo)" - ); - // Healing right after a full literal - test( - { - R"(1 )", - }, - R"(1)", - "" - ); - test( - { - R"(true)", - R"(true )", - }, - R"(true)", - "" - ); - test( - { - R"(null)", - R"(null )", - }, - R"(null)", - "" - ); - test( - { - R"([1 )", - }, - R"([1,"$foo"])", - R"(,"$foo)" - ); - test( - { - R"([{})", - R"([{} )", - }, - R"([{},"$foo"])", - R"(,"$foo)" - ); - test( - { - R"([true)", - }, - // TODO: detect the true/false/null literal was complete - R"(["$foo"])", - R"("$foo)" - ); - test( - { - R"([true )", - }, - R"([true,"$foo"])", - R"(,"$foo)" - ); - test( - { - R"([true,)", - }, - R"([true,"$foo"])", - R"("$foo)" - ); - // Test nesting - test( - { - R"([{"a": [{"b": [{)", - }, - R"([{"a":[{"b":[{"$foo":1}]}]}])", - R"("$foo)" - ); - test( - { - R"([{"a": [{"b": [)", - }, - R"([{"a":[{"b":["$foo"]}]}])", - R"("$foo)" - ); - - test( - { - R"([{"a": "b"})", - R"([{"a": "b"} )", - }, - R"([{"a":"b"},"$foo"])", - R"(,"$foo)" - ); - test( - { - R"([{"a": "b"},)", - R"([{"a": "b"}, )", - }, - R"([{"a":"b"},"$foo"])", - R"("$foo)" - ); - test( - { - R"({ "code)", - }, - R"({"code$foo":1})", - R"($foo)" - ); - test( - { - R"({ "code\)", - }, - R"({"code\\$foo":1})", - R"(\$foo)" - ); - test( - { - R"({ "code")", - }, - R"({"code":"$foo"})", - R"(:"$foo)" - ); - test( - { - R"({ "key")", - }, - R"({"key":"$foo"})", - R"(:"$foo)" - ); - // Test unicode escape sequences - test( - { - R"({"a":"\u)", - }, - R"({"a":"\u0000$foo"})", - R"(0000$foo)" - ); - test( - { - R"({"a":"\u00)", - }, - R"({"a":"\u0000$foo"})", - R"(00$foo)" - ); - test( - { - R"({"a":"\ud300)", - }, - R"({"a":"\ud300$foo"})", - R"($foo)" - ); - test( - { - R"({"a":"\ud800)", - }, - R"({"a":"\ud800\udc00$foo"})", - R"(\udc00$foo)" - ); - test( - { - R"({"a":"\ud800\)", - }, - R"({"a":"\ud800\udc00$foo"})", - R"(udc00$foo)" - ); - test( - { - R"({"a":"\ud800\u)", - }, - R"({"a":"\ud800\udc00$foo"})", - R"(dc00$foo)" - ); - test( - { - R"({"a":"\ud800\udc00)", - }, - R"({"a":"\ud800\udc00$foo"})", - R"($foo)" - ); -} - -int main() { - test_json_healing(); - std::cerr << "All tests passed.\n"; - return 0; -} diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index b4362852c392..f095274cd114 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -92,7 +92,7 @@ static void test_all(const std::string & lang, std::function pattern; pattern.reserve(n_layer); for (uint32_t il = 0; il < n_layer; il++) { @@ -322,6 +322,7 @@ static std::vector get_logits( static bool moe_mandatory(const llm_arch arch) { switch (arch) { case LLM_ARCH_LLAMA4: + case LLM_ARCH_COHERE2MOE: case LLM_ARCH_GROK: case LLM_ARCH_QWEN2MOE: case LLM_ARCH_QWEN3MOE: @@ -450,6 +451,9 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } + if (arch == LLM_ARCH_EAGLE3) { + continue; + } for (bool moe : {false, true}) { if (moe && !moe_implemented(arch)) { continue; @@ -553,6 +557,9 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg if (arch == LLM_ARCH_GEMMA4 || arch == LLM_ARCH_GEMMA4_ASSISTANT) { continue; // FIXME: ISWA KV cache initialization needs more fixture params } + if (arch == LLM_ARCH_EAGLE3) { + continue; + } const bool encode = arch == LLM_ARCH_T5 || arch == LLM_ARCH_DREAM || arch == LLM_ARCH_LLADA || arch == LLM_ARCH_LLADA_MOE || arch == LLM_ARCH_RND1; for (bool moe : {false, true}) { diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index a05fab50421f..9510ac14ce00 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -102,21 +102,34 @@ static float dot_product_error(const ggml_type_traits * qfns, const ggml_type_tr return fabsf(result - dot_ref) / test_size; } -int main(int argc, char * argv[]) { - bool verbose = false; - const size_t test_size = 32 * 128; - - std::string arg; - for (int i = 1; i < argc; i++) { - arg = argv[i]; - - if (arg == "-v") { - verbose = true; - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - return 1; +static int test_vec_dot_f32(bool verbose) { + const auto * f32 = ggml_get_type_traits_cpu(GGML_TYPE_F32); + int num_failed = 0; + for (int n : {1, 2, 3, 5, 7, 8, 15, 16, 17, 31, 33, 63, 67, 127, 129, 193, 255, 1023}) { + std::vector a(n); + std::vector b(n); + generate_data(0.0, n, a.data()); + generate_data(1.0, n, b.data()); + + float result = 0.0f; + f32->vec_dot(n, &result, 0, a.data(), 0, b.data(), 0, 1); + const float ref = dot_product(a.data(), b.data(), n); + const float error = fabsf(result - ref) / n; + + const bool failed = !(error < MAX_QUANTIZATION_REFERENCE_ERROR); + num_failed += failed; + if (failed || verbose) { + printf(" f32 vec_dot n=%4d: %s (ref=%f got=%f err=%f)\n", + n, RESULT_STR[failed], ref, result, error); } } + return num_failed; +} + +static int test_vec_dot_q(bool verbose) { + int num_failed = 0; + + const size_t test_size = 32 * 128; std::vector test_data(test_size); std::vector test_data2(test_size); @@ -124,11 +137,6 @@ int main(int argc, char * argv[]) { generate_data(0.0, test_data.size(), test_data.data()); generate_data(1.0, test_data2.size(), test_data2.data()); - ggml_cpu_init(); - - int num_failed = 0; - bool failed = false; - for (int i = 0; i < GGML_TYPE_COUNT; i++) { ggml_type type = (ggml_type) i; const auto * qfns = ggml_get_type_traits(type); @@ -150,13 +158,14 @@ int main(int argc, char * argv[]) { type == GGML_TYPE_Q1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_BINARY : type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : + type == GGML_TYPE_Q2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY : type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS : type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS : type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : type == GGML_TYPE_NVFP4 ? MAX_QUANTIZATION_TOTAL_ERROR_FP4 : MAX_QUANTIZATION_TOTAL_ERROR; - failed = !(total_error < max_quantization_error); + bool failed = !(total_error < max_quantization_error); num_failed += failed; if (failed || verbose) { printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error); @@ -171,15 +180,15 @@ int main(int argc, char * argv[]) { const float vec_dot_error = dot_product_error(qfns, qfns_cpu, test_size, test_data.data(), test_data2.data()); const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS || - type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S - ? MAX_DOT_PRODUCT_ERROR_LOWBIT - : type == GGML_TYPE_Q1_0 - ? MAX_DOT_PRODUCT_ERROR_BINARY - : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0 - ? MAX_DOT_PRODUCT_ERROR_TERNARY - : type == GGML_TYPE_NVFP4 - ? MAX_DOT_PRODUCT_ERROR_FP4 - : MAX_DOT_PRODUCT_ERROR; + type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S + ? MAX_DOT_PRODUCT_ERROR_LOWBIT + : type == GGML_TYPE_Q1_0 + ? MAX_DOT_PRODUCT_ERROR_BINARY + : type == GGML_TYPE_TQ1_0 || type == GGML_TYPE_TQ2_0 || type == GGML_TYPE_Q2_0 + ? MAX_DOT_PRODUCT_ERROR_TERNARY + : type == GGML_TYPE_NVFP4 + ? MAX_DOT_PRODUCT_ERROR_FP4 + : MAX_DOT_PRODUCT_ERROR; failed = !(vec_dot_error < max_allowed_error); num_failed += failed; if (failed || verbose) { @@ -188,6 +197,31 @@ int main(int argc, char * argv[]) { } } + return num_failed; +} + +int main(int argc, char * argv[]) { + bool verbose = false; + + std::string arg; + for (int i = 1; i < argc; i++) { + arg = argv[i]; + + if (arg == "-v") { + verbose = true; + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + return 1; + } + } + + ggml_cpu_init(); + + int num_failed = 0; + + num_failed += test_vec_dot_f32(verbose); + num_failed += test_vec_dot_q(verbose); + if (num_failed || verbose) { printf("%d tests failed\n", num_failed); } diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index 7cd96c5cd351..2aecff90e7bb 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -360,9 +360,9 @@ int main(void) { test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 1}, {0.241818f, 0.241818f, 0.032727f, 0.241818f, 0.241818f}, 2.0f, 1.1f, 2, 5, {}); test_dry({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 3, 4, 0, 1}, {0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, 1.0f, 1.1f, 4, 7, {}); - test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.571429f, 0.428571f, 0.0f, 0.0f}, 1.00f); + test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.0f, 0.0f, 0.428571f, 0.571429f}, 1.00f); test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 0.00f); // top_n_sigma == 0 now represents a no-op rather than greedy decoding as of PR#13345 - test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 3.00f); + test_top_n_sigma({0.1f, 0.2f, 0.3f, 0.4f}, {0.1f, 0.2f, 0.3f, 0.4f}, 3.00f); test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f); test_sampler_queue(10000, "k", 1, 1.0f, 1.0f); diff --git a/tests/test-thread-safety.cpp b/tests/test-thread-safety.cpp index acda4aa81eab..d0b5946e2c8a 100644 --- a/tests/test-thread-safety.cpp +++ b/tests/test-thread-safety.cpp @@ -146,6 +146,8 @@ int main(int argc, char ** argv) { } LOG_INF("Model %d/%d, Context %d/%d: %s\n\n", m + 1, num_models, c + 1, num_contexts, result.c_str()); + + llama_synchronize(ctx.get()); }); } } diff --git a/tools/cli/README.md b/tools/cli/README.md index b11aa45ce958..f93ae914ce27 100644 --- a/tools/cli/README.md +++ b/tools/cli/README.md @@ -161,7 +161,7 @@ | `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md
(env: LLAMA_ARG_MMPROJ_URL) | | `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)
(env: LLAMA_ARG_MMPROJ_AUTO) | | `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)
(env: LLAMA_ARG_MMPROJ_OFFLOAD) | -| `--image, --audio FILE` | path to an image or audio file. use with multimodal models, use comma-separated values for multiple files | +| `--image, --audio, --video FILE` | path to an image, audio, or video file. use with multimodal models, use comma-separated values for multiple files | | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MIN_TOKENS) | | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) | @@ -174,6 +174,7 @@ | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek-ocr, deepseek2, deepseek3, exaone-moe, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, granite-4.0, granite-4.1, grok-2, hunyuan-dense, hunyuan-moe, hunyuan-vl, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, solar-open, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | | `--skip-chat-parsing, --no-skip-chat-parsing` | force a pure content parser, even if a Jinja template is specified; model will output everything in the content section, including any reasoning and/or tool calls (default: disabled)
(env: LLAMA_ARG_SKIP_CHAT_PARSING) | | `--simple-io` | use basic IO for better compatibility in subprocesses and limited consoles | +| `--log-prompts-dir PATH` | Log prompts to directory (only used for debugging, default: disabled) | | `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) | | `--spec-draft-threads, -td, --threads-draft N` | number of threads to use during generation (default: same as --threads) | | `--spec-draft-threads-batch, -tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) | diff --git a/tools/cli/cli.cpp b/tools/cli/cli.cpp index 3ed345bf0f0a..8b7b58693fc9 100644 --- a/tools/cli/cli.cpp +++ b/tools/cli/cli.cpp @@ -97,11 +97,18 @@ struct cli_context { task.params.chat_parser_params.parser.load(chat_params.parser); } + // Copy the preserved tokens into the sampling params + const llama_vocab * vocab = llama_model_get_vocab( + llama_get_model(ctx_server.get_llama_context())); + for (const auto & token : chat_params.preserved_tokens) { + auto ids = common_tokenize(vocab, token, false, true); + if (ids.size() == 1) { + task.params.sampling.preserved_tokens.insert(ids[0]); + } + } + // reasoning budget sampler if (!chat_params.thinking_end_tag.empty()) { - const llama_vocab * vocab = llama_model_get_vocab( - llama_get_model(ctx_server.get_llama_context())); - task.params.sampling.reasoning_budget_tokens = defaults.sampling.reasoning_budget_tokens; task.params.sampling.generation_prompt = chat_params.generation_prompt; @@ -195,7 +202,7 @@ struct cli_context { // TODO: support remote files in the future (http, https, etc) std::string load_input_file(const std::string & fname, bool is_media) { - std::ifstream file(fname, std::ios::binary); + std::ifstream file = fs_open_ifstream(fname, std::ios::binary); if (!file) { return ""; } diff --git a/tools/export-lora/README.md b/tools/export-lora/README.md index 7dce99c9a9e6..f0729341f3ad 100644 --- a/tools/export-lora/README.md +++ b/tools/export-lora/README.md @@ -6,11 +6,10 @@ Apply LORA adapters to base model and export the resulting model. usage: llama-export-lora [options] options: - -m, --model model path from which to load base model (default '') - --lora FNAME path to LoRA adapter (can be repeated to use multiple adapters) - --lora-scaled FNAME S path to LoRA adapter with user defined scaling S (can be repeated to use multiple adapters) - -t, --threads N number of threads to use during computation (default: 4) - -o, --output FNAME output file (default: 'ggml-lora-merged-f16.gguf') + -m, --model FNAME model path from which to load base model + --lora FNAME path to LoRA adapter (use comma-separated values to load multiple adapters) + --lora-scaled FNAME:SCALE,... path to LoRA adapter with user defined scaling (format: FNAME:SCALE,...) + -o, --output, --output-file FNAME output file (default: 'ggml-lora-merged-f16.gguf') ``` For example: @@ -22,12 +21,11 @@ For example: --lora lora-open-llama-3b-v2-english2tokipona-chat-LATEST.gguf ``` -Multiple LORA adapters can be applied by passing multiple `--lora FNAME` or `--lora-scaled FNAME S` command line parameters: +Multiple LORA adapters can be applied by passing comma-separated values to `--lora FNAME` or `--lora-scaled FNAME:SCALE,...`: ```bash ./bin/llama-export-lora \ -m your_base_model.gguf \ -o your_merged_model.gguf \ - --lora-scaled lora_task_A.gguf 0.5 \ - --lora-scaled lora_task_B.gguf 0.5 + --lora-scaled lora_task_A.gguf:0.5,lora_task_B.gguf:0.5 ``` diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index a85f86c3ab26..2695f58785e2 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -323,6 +323,7 @@ struct cmd_params { std::vector hf_repo; std::vector hf_file; std::string hf_token; + bool offline; std::vector n_prompt; std::vector n_gen; std::vector> n_pg; @@ -367,6 +368,7 @@ static const cmd_params cmd_params_defaults = { /* hf_repo */ {}, /* hf_file */ {}, /* hf_token */ "", + /* offline */ false, /* n_prompt */ { 512 }, /* n_gen */ { 128 }, /* n_pg */ {}, @@ -437,6 +439,8 @@ static void print_usage(int /* argc */, char ** argv) { printf(" (default: unused)\n"); printf(" -hft, --hf-token Hugging Face access token\n"); printf(" (default: value from HF_TOKEN environment variable)\n"); + printf(" --offline Offline mode: forces use of cache, prevents network access\n"); + printf(" (default: disabled)\n"); printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); @@ -558,6 +562,8 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.hf_token = argv[i]; + } else if (arg == "--offline") { + params.offline = true; } else if (arg == "-p" || arg == "--n-prompt") { if (++i >= argc) { invalid_param = true; @@ -1029,24 +1035,23 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (!params.hf_repo.empty()) { for (size_t i = 0; i < params.hf_repo.size(); i++) { - common_params_model model; - - if (params.hf_file.empty() || params.hf_file[i].empty()) { - model.hf_repo = params.hf_repo[i]; - } else { - model.hf_repo = params.hf_repo[i]; - model.hf_file = params.hf_file[i]; + common_params p; + p.hf_token = params.hf_token; + p.offline = params.offline; + p.model.hf_repo = params.hf_repo[i]; + if (!params.hf_file.empty() && !params.hf_file[i].empty()) { + p.model.hf_file = params.hf_file[i]; } - common_download_opts opts; - opts.bearer_token = params.hf_token; - auto download_result = common_download_model(model, opts); - if (download_result.model_path.empty()) { + // only the text model file is needed + common_models_handler models_handler = common_models_handler_init(p, LLAMA_EXAMPLE_BENCH); + common_models_handler_apply(models_handler, p); + if (p.model.path.empty()) { fprintf(stderr, "error: failed to download model from HuggingFace\n"); exit(1); } - params.model.push_back(download_result.model_path); + params.model.push_back(p.model.path); } } diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 09b62357f33e..ea684d9f156d 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -115,22 +115,28 @@ if (TARGET mtmd) endif() endif() -add_executable(llama-llava-cli deprecation-warning.cpp) -add_executable(llama-gemma3-cli deprecation-warning.cpp) -add_executable(llama-minicpmv-cli deprecation-warning.cpp) -add_executable(llama-qwen2vl-cli deprecation-warning.cpp) - -set(TARGET llama-mtmd-cli) -add_executable (${TARGET} mtmd-cli.cpp) -set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli) -if(LLAMA_TOOLS_INSTALL) - install(TARGETS ${TARGET} RUNTIME) +# Gate CLI binaries on LLAMA_BUILD_TOOLS so that standalone library-only +# builds (LLAMA_BUILD_MTMD=ON with LLAMA_BUILD_TOOLS=OFF — e.g. Apple +# XCFramework packaging) skip the executables entirely. LLAMA_BUILD_COMMON +# defaults to ON in standalone builds, so we cannot rely on it for gating. +if (LLAMA_BUILD_TOOLS) + add_executable(llama-llava-cli deprecation-warning.cpp) + add_executable(llama-gemma3-cli deprecation-warning.cpp) + add_executable(llama-minicpmv-cli deprecation-warning.cpp) + add_executable(llama-qwen2vl-cli deprecation-warning.cpp) + + set(TARGET llama-mtmd-cli) + add_executable (${TARGET} mtmd-cli.cpp) + set_target_properties (${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli) + if(LLAMA_TOOLS_INSTALL) + install(TARGETS ${TARGET} RUNTIME) + endif() + target_link_libraries (${TARGET} PRIVATE llama-common mtmd Threads::Threads) + target_compile_features(${TARGET} PRIVATE cxx_std_17) + + # mtmd-debug tool + add_executable(llama-mtmd-debug debug/mtmd-debug.cpp) + set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug) + target_link_libraries(llama-mtmd-debug PRIVATE llama-common mtmd Threads::Threads) + target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17) endif() -target_link_libraries (${TARGET} PRIVATE llama-common mtmd Threads::Threads) -target_compile_features(${TARGET} PRIVATE cxx_std_17) - -# mtmd-debug tool -add_executable(llama-mtmd-debug debug/mtmd-debug.cpp) -set_target_properties(llama-mtmd-debug PROPERTIES OUTPUT_NAME llama-mtmd-debug) -target_link_libraries(llama-mtmd-debug PRIVATE llama-common mtmd Threads::Threads) -target_compile_features(llama-mtmd-debug PRIVATE cxx_std_17) diff --git a/tools/mtmd/README-dev.md b/tools/mtmd/README-dev.md new file mode 100644 index 000000000000..3a08915876c1 --- /dev/null +++ b/tools/mtmd/README-dev.md @@ -0,0 +1,35 @@ +# libmtmd dev guide + +## History + +Please refer to [multimodal.md](../../docs/multimodal.md) for a broader context. + +In short: +- `libmtmd` started as a wrapper around `libllava` / `clip.cpp` +- Various components that used to be in `clip.cpp` are moved progressively to mtmd. For example, preprocessor is now part of mtmd + +## Terminologies + +- mtmd: **M**ul**T**i**M**o**D**al +- bitmap: representing a raw input data, for example: RGB image, PCM audio +- tiles / slices: for llava-uhd-style models, the preprocessor breaks a large input into smaller square images called tiles or slices +- chunk: a mtmd_input_chunk represents a preprocessed input that can then be passed through `mtmd_encode()` + +## Pipeline + +A typical pipeline of the core libmtmd is as follows: +- A bitmap (RGB image or PCM audio) is created +- Bitmap and the text prompt is provided to `mtmd_tokenize()` that breaks the input into chunks + - The tokenizer function first expands a "lazy" bitmap if it finds one. Typically, this is used by video, so that one media token corresponds to one input bitmap + - For models that support "fused" temporal frames like Qwen-VL, the tokenizer tries to merge pair of consecutive frames into one batch + - The preprocessor will then be called, which produces a list of chunks + - Depending on the model itself, special tokens will be injected to separate image chunks (i.e. llava-uhd-style models) +- Multiple bitmaps may be batched together to form a larger `mtmd_batch()` +- Single image or batch is encoded, via `mtmd_encode()` or `mtmd_batch_encode()` +- Get the output embeddings + +## Helper + +We provide a set of helper functions via `mtmd_helper` to make using libmtmd easier. The helper provides: +- Image, audio and video file decoding (for example, decode raw JPEG into RGB bitmap) +- Manage `llama_batch` and calls to `llama_decode` diff --git a/tools/mtmd/clip-graph.h b/tools/mtmd/clip-graph.h index 7d10586217b8..c84b32880b5d 100644 --- a/tools/mtmd/clip-graph.h +++ b/tools/mtmd/clip-graph.h @@ -54,6 +54,10 @@ struct clip_graph { virtual ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const; // TODO: build_mm(w, b, x) to support bias + virtual bool support_batch() const { + return false; + } + // // utility functions // diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index b104f373618b..5b413681f040 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -13,6 +13,14 @@ #include #include #include +#include + +#ifdef _WIN32 +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#endif // Internal header for clip.cpp @@ -34,6 +42,7 @@ #define KEY_N_HEAD "clip.%s.attention.head_count" #define KEY_N_HEAD_KV "clip.%s.attention.head_count_kv" #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" +#define KEY_FEATURE_LAYERS "clip.%s.feature_layer" // vision-specific #define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities @@ -46,7 +55,6 @@ #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_FEATURE_LAYER "clip.vision.feature_layer" #define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" #define KEY_PROJ_SAMPLE_QUERY_SIDE "clip.vision.projector.query_side" #define KEY_PROJ_SAMPLE_WINDOW_SIDE "clip.vision.projector.window_side" @@ -367,56 +375,56 @@ enum projector_type { }; static std::map PROJECTOR_TYPE_NAMES = { - { PROJECTOR_TYPE_MLP, "mlp" }, - { PROJECTOR_TYPE_LDP, "ldp" }, - { PROJECTOR_TYPE_LDPV2, "ldpv2"}, - { PROJECTOR_TYPE_MINICPMV, "resampler"}, - { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, - { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, - { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, - { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, - { PROJECTOR_TYPE_STEP3VL, "step3vl"}, - { PROJECTOR_TYPE_GEMMA3, "gemma3"}, - { PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"}, - { PROJECTOR_TYPE_GEMMA3NA, "gemma3na"}, - { PROJECTOR_TYPE_GEMMA4V, "gemma4v"}, - { PROJECTOR_TYPE_GEMMA4A, "gemma4a"}, - { PROJECTOR_TYPE_GEMMA4UV, "gemma4uv"}, - { PROJECTOR_TYPE_GEMMA4UA, "gemma4ua"}, - { PROJECTOR_TYPE_PHI4, "phi4"}, - { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, - { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, - { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, - { PROJECTOR_TYPE_INTERNVL, "internvl"}, - { PROJECTOR_TYPE_LLAMA4, "llama4"}, - { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, - { PROJECTOR_TYPE_QWEN3A, "qwen3a"}, - { PROJECTOR_TYPE_GLMA, "glma"}, - { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, - { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, - { PROJECTOR_TYPE_MERALION, "meralion"}, - { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"}, - { PROJECTOR_TYPE_LFM2, "lfm2"}, - { PROJECTOR_TYPE_KIMIVL, "kimivl"}, - { PROJECTOR_TYPE_PADDLEOCR, "paddleocr"}, - { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, - { PROJECTOR_TYPE_COGVLM, "cogvlm"}, - { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, - { PROJECTOR_TYPE_DOTS_OCR, "dots_ocr"}, - { PROJECTOR_TYPE_DEEPSEEKOCR,"deepseekocr"}, - { PROJECTOR_TYPE_DEEPSEEKOCR2,"deepseekocr2"}, - { PROJECTOR_TYPE_LFM2A, "lfm2a"}, - { PROJECTOR_TYPE_GLM4V, "glm4v"}, - { PROJECTOR_TYPE_YOUTUVL, "youtuvl"}, - { PROJECTOR_TYPE_YASA2, "yasa2"}, - { PROJECTOR_TYPE_KIMIK25, "kimik25"}, - { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"}, - { PROJECTOR_TYPE_EXAONE4_5, "exaone4_5"}, - { PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"}, - { PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"}, - { PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"}, - { PROJECTOR_TYPE_MIMOVL, "mimovl"}, - { PROJECTOR_TYPE_GRANITE4_VISION, "granite4_vision"}, + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, + { PROJECTOR_TYPE_LDPV2, "ldpv2"}, + { PROJECTOR_TYPE_MINICPMV, "resampler"}, + { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, + { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, + { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, + { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, + { PROJECTOR_TYPE_STEP3VL, "step3vl"}, + { PROJECTOR_TYPE_GEMMA3, "gemma3"}, + { PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"}, + { PROJECTOR_TYPE_GEMMA3NA, "gemma3na"}, + { PROJECTOR_TYPE_GEMMA4V, "gemma4v"}, + { PROJECTOR_TYPE_GEMMA4A, "gemma4a"}, + { PROJECTOR_TYPE_GEMMA4UV, "gemma4uv"}, + { PROJECTOR_TYPE_GEMMA4UA, "gemma4ua"}, + { PROJECTOR_TYPE_PHI4, "phi4"}, + { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, + { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, + { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, + { PROJECTOR_TYPE_INTERNVL, "internvl"}, + { PROJECTOR_TYPE_LLAMA4, "llama4"}, + { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, + { PROJECTOR_TYPE_QWEN3A, "qwen3a"}, + { PROJECTOR_TYPE_GLMA, "glma"}, + { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, + { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, + { PROJECTOR_TYPE_MERALION, "meralion"}, + { PROJECTOR_TYPE_MUSIC_FLAMINGO, "musicflamingo"}, + { PROJECTOR_TYPE_LFM2, "lfm2"}, + { PROJECTOR_TYPE_KIMIVL, "kimivl"}, + { PROJECTOR_TYPE_PADDLEOCR, "paddleocr"}, + { PROJECTOR_TYPE_LIGHTONOCR, "lightonocr"}, + { PROJECTOR_TYPE_COGVLM, "cogvlm"}, + { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, + { PROJECTOR_TYPE_DOTS_OCR, "dots_ocr"}, + { PROJECTOR_TYPE_DEEPSEEKOCR, "deepseekocr"}, + { PROJECTOR_TYPE_DEEPSEEKOCR2, "deepseekocr2"}, + { PROJECTOR_TYPE_LFM2A, "lfm2a"}, + { PROJECTOR_TYPE_GLM4V, "glm4v"}, + { PROJECTOR_TYPE_YOUTUVL, "youtuvl"}, + { PROJECTOR_TYPE_YASA2, "yasa2"}, + { PROJECTOR_TYPE_KIMIK25, "kimik25"}, + { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"}, + { PROJECTOR_TYPE_EXAONE4_5, "exaone4_5"}, + { PROJECTOR_TYPE_HUNYUANVL, "hunyuanvl"}, + { PROJECTOR_TYPE_MINICPMV4_6, "minicpmv4_6"}, + { PROJECTOR_TYPE_GRANITE_SPEECH, "granite_speech"}, + { PROJECTOR_TYPE_MIMOVL, "mimovl"}, + { PROJECTOR_TYPE_GRANITE4_VISION, "granite4_vision"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { @@ -640,47 +648,18 @@ static void clip_log_internal(enum ggml_log_level level, const char * format, .. // cpp wrappers // -// wrapper for clip_image_size -struct clip_image_size_deleter { - void operator()(clip_image_size * val) { clip_image_size_free(val); } -}; -typedef std::unique_ptr clip_image_size_ptr; - -// wrapper for clip_image_u8 -struct clip_image_u8_deleter { - void operator()(clip_image_u8 * val) { clip_image_u8_free(val); } -}; -typedef std::unique_ptr clip_image_u8_ptr; - -// wrapper for clip_image_f32 -struct clip_image_f32_deleter { - void operator()(clip_image_f32 * val) { clip_image_f32_free(val); } -}; -typedef std::unique_ptr clip_image_f32_ptr; - -struct clip_image_u8_batch { - std::vector entries; -}; - struct clip_image_f32_batch { - std::vector entries; + std::vector entries; bool is_audio = false; - // for llava-uhd style models, we need to know the grid size - // note: entries.size() == grid_x * grid_y + 1 (one overview image) - int grid_x = 0; - int grid_y = 0; - clip_image_f32_batch clone() const { clip_image_f32_batch new_batch{ /* entries */ {}, /* is_audio */ is_audio, - /* grid_x */ grid_x, - /* grid_y */ grid_y, }; new_batch.entries.reserve(entries.size()); for (const auto & entry : entries) { - new_batch.entries.emplace_back(new clip_image_f32(*entry)); + new_batch.entries.emplace_back(entry); // copy } return new_batch; } @@ -690,6 +669,22 @@ struct clip_image_f32_batch { // common utils // +#ifdef _WIN32 +static std::ifstream open_ifstream_binary(const std::string & fname) { + int wlen = MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, NULL, 0); + if (!wlen) { + throw std::runtime_error("failed to convert filename to UTF-16: " + fname); + } + std::vector wfname(wlen); + (void)MultiByteToWideChar(CP_UTF8, 0, fname.c_str(), -1, wfname.data(), wlen); + return std::ifstream(wfname.data(), std::ios::binary); +} +#else +static std::ifstream open_ifstream_binary(const std::string & fname) { + return std::ifstream(fname, std::ios::binary); +} +#endif + static std::string string_format(const char * fmt, ...) { va_list ap; va_list ap2; diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index 48796b6306fa..46be39a641c1 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -55,8 +55,7 @@ struct clip_hparams { int32_t n_head = 0; int32_t n_head_kv = 0; int32_t n_layer = 0; - // idefics3 - int32_t n_merge = 0; // number of patch merges **per-side** + int32_t n_merge = 1; // number of patch merges **per-side** // for preprocessor int32_t image_longest_edge = 0; @@ -91,7 +90,7 @@ struct clip_hparams { float eps = 1e-6; float rope_theta = 0.0; - std::vector vision_feature_layer; + std::vector feature_layers; int32_t attn_window_size = 0; int32_t n_wa_pattern = 0; std::unordered_set wa_layer_indexes; // explicit layer indexes that use full attention (for irregular patterns like YoutuVL) @@ -135,8 +134,7 @@ struct clip_hparams { int32_t custom_image_max_tokens = -1; void set_limit_image_tokens(int n_tokens_min, int n_tokens_max) { - const int cur_merge = n_merge == 0 ? 1 : n_merge; - const int patch_area = patch_size * patch_size * cur_merge * cur_merge; + const int patch_area = patch_size * patch_size * n_merge * n_merge; image_min_pixels = (custom_image_min_tokens > 0 ? custom_image_min_tokens : n_tokens_min) * patch_area; image_max_pixels = (custom_image_max_tokens > 0 ? custom_image_max_tokens : n_tokens_max) * patch_area; warmup_image_size = static_cast(std::sqrt(image_max_pixels)); @@ -145,8 +143,7 @@ struct clip_hparams { void set_warmup_n_tokens(int n_tokens) { int n_tok_per_side = static_cast(std::sqrt(n_tokens)); GGML_ASSERT(n_tok_per_side * n_tok_per_side == n_tokens && "n_tokens must be n*n"); - const int cur_merge = n_merge == 0 ? 1 : n_merge; - warmup_image_size = n_tok_per_side * patch_size * cur_merge; + warmup_image_size = n_tok_per_side * patch_size * n_merge; // TODO: support warmup size for custom token numbers } // sam vit deepseek-ocr @@ -165,8 +162,8 @@ struct clip_hparams { return false; } - bool is_vision_feature_layer(int32_t layer) const { - return std::find(vision_feature_layer.begin(), vision_feature_layer.end(), layer) != vision_feature_layer.end(); + bool is_feature_layer(int32_t layer) const { + return std::find(feature_layers.begin(), feature_layers.end(), layer) != feature_layers.end(); } }; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 02e7a3a8f60f..d2226b3be1d7 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -171,6 +171,8 @@ struct clip_ctx { std::map mem_usage; std::map mem_compute; + bool support_batch = false; + clip_ctx(clip_context_params & ctx_params) { flash_attn_type = ctx_params.flash_attn_type; no_alloc = ctx_params.no_alloc; @@ -314,7 +316,7 @@ ggml_tensor * clip_graph::build_vit( std::function add_pos, const build_vit_opts & opts ) { - // batch dim: inp is [n_embd, n_pos] (B==1) or [n_embd, n_pos, B] (multi-tile encode) + // batch dim: inp is [n_embd, n_pos, B] const int64_t B = inp->ne[2]; if (learned_pos_embd) { @@ -532,7 +534,7 @@ ggml_tensor * clip_graph::build_vit( ggml_tensor * clip_graph::build_inp() { ggml_tensor * inp_raw = build_inp_raw(); ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd); + inp = ggml_reshape_3d(ctx0, inp, n_patches, n_embd, n_batch); inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); if (model.patch_bias) { inp = ggml_add(ctx0, inp, model.patch_bias); @@ -862,8 +864,8 @@ ggml_tensor * clip_graph::build_patch_merge_permute(ggml_tensor * cur, int scale return cur; } -static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs) { - const clip_image_f32 & img = *imgs.entries[0]; +static std::unique_ptr clip_get_graph_builder(clip_ctx * ctx, const clip_image_f32_batch & imgs) { + const clip_image_f32 & img = imgs.entries[0]; std::unique_ptr builder; switch (ctx->proj_type()) { @@ -1025,7 +1027,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // TODO [QWEN_VIDEO]: improve this in the future builder->n_batch = imgs.entries.size(); - return builder->build(); + return builder; } // @@ -1043,8 +1045,17 @@ struct clip_model_loader { bool has_vision = false; bool has_audio = false; + mtmd_progress_callback progress_callback = nullptr; + void * progress_callback_user_data = nullptr; + // TODO @ngxson : we should not pass clip_ctx here, it should be clip_model - clip_model_loader(const char * fname, bool skip_tensors = false) : fname(fname) { + clip_model_loader(const char * fname, + bool skip_tensors = false, + mtmd_progress_callback progress_cb = nullptr, + void * progress_user_data = nullptr) + : fname(fname), + progress_callback(progress_cb), + progress_callback_user_data(progress_user_data) { struct ggml_context * meta = nullptr; struct gguf_init_params params = { @@ -1199,6 +1210,9 @@ struct clip_model_loader { { std::vector pinpoints; get_arr_int(KEY_IMAGE_GRID_PINPOINTS, pinpoints, false); + if (pinpoints.size() % 2 != 0) { + throw std::runtime_error(string_format("%s: image_grid_pinpoints must have an even number of elements, got %zu\n", __func__, pinpoints.size())); + } if (!pinpoints.empty()) { for (size_t i = 0; i < pinpoints.size(); i += 2) { hparams.image_res_candidates.push_back({ @@ -1241,24 +1255,23 @@ struct clip_model_loader { } if (is_vision) { - int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN); - int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD); - GGML_ASSERT(idx_mean >= 0 && "image_mean not found"); - GGML_ASSERT(idx_std >= 0 && "image_std not found"); - const float * mean_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_mean); - const float * std_data = (const float *) gguf_get_arr_data(ctx_gguf.get(), idx_std); + std::vector image_mean; + std::vector image_std; + get_arr_f32(KEY_IMAGE_MEAN, image_mean, false); + get_arr_f32(KEY_IMAGE_STD , image_std, false); + if (image_mean.size() < 3 || image_std.size() < 3) { + throw std::runtime_error(string_format("%s: image_mean/image_std arrays must have at least 3 elements, got %zu and %zu\n", __func__, image_mean.size(), image_std.size())); + } for (int i = 0; i < 3; ++i) { - hparams.image_mean[i] = mean_data[i]; - hparams.image_std[i] = std_data[i]; + hparams.image_mean[i] = image_mean[i]; + hparams.image_std[i] = image_std[i]; } } - // Load the vision feature layer indices if they are explicitly provided; - // if multiple vision feature layers are present, the values will be concatenated - // to form the final visual features. + // Load the vision/audio feature layer indices if they are explicitly provided // NOTE: gguf conversions should standardize the values of the vision feature layer to // be non-negative, since we use -1 to mark values as unset here. - get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer, false); + get_arr_int(string_format(KEY_FEATURE_LAYERS, prefix), hparams.feature_layers, false); // model-specific params switch (model.proj_type) { @@ -1640,6 +1653,7 @@ struct clip_model_loader { get_u32(KEY_A_PROJ_WINDOW_SIZE, hparams.audio_proj_window_size); get_u32(KEY_A_PROJ_DOWNSAMPLE_RATE, hparams.audio_proj_downsample_rate); get_u32(KEY_A_PROJ_HEAD_COUNT, hparams.audio_proj_head_count); + // NOTE: feature layers loaded above in common path } break; case PROJECTOR_TYPE_JANUS_PRO: { @@ -1652,11 +1666,11 @@ struct clip_model_loader { hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW; hparams.image_resize_pad = PAD_CEIL; - get_arr_int(KEY_FEATURE_LAYER, hparams.vision_feature_layer); + // NOTE: feature_layers loaded in common path as optional get_arr_int(KEY_PROJ_SPATIAL_OFFSETS, hparams.proj_spatial_offsets); - if (hparams.vision_feature_layer.size() != hparams.proj_spatial_offsets.size()) { - throw std::runtime_error(string_format("%s: vision_feature_layer.size() %d != proj_spatial_offsets.size() %d", - hparams.vision_feature_layer.size(), hparams.proj_spatial_offsets.size())); + if (hparams.feature_layers.size() != hparams.proj_spatial_offsets.size()) { + throw std::runtime_error(string_format("%s: feature_layers.size() %d != proj_spatial_offsets.size() %d", + hparams.feature_layers.size(), hparams.proj_spatial_offsets.size())); } get_u32(KEY_PROJ_SAMPLE_QUERY_SIDE, hparams.downsample_query_side); @@ -1673,8 +1687,11 @@ struct clip_model_loader { // note: some models having hparams.image_size == 0, which means the image size is dynamic throw std::runtime_error(string_format("%s: image_size (%d) cannot be negative\n", __func__, hparams.image_size)); } - if (hparams.patch_size <= 0) { - throw std::runtime_error(string_format("%s: patch_size (%d) must be greater than 0\n", __func__, hparams.patch_size)); + if (hparams.image_size > 65536) { + throw std::runtime_error(string_format("%s: image_size (%d) is too large (max 65536)\n", __func__, hparams.image_size)); + } + if (hparams.patch_size <= 0 || hparams.patch_size >= 65536) { + throw std::runtime_error(string_format("%s: patch_size (%d) must be positive and less than 65536\n", __func__, hparams.patch_size)); } if (hparams.n_embd <= 0) { throw std::runtime_error(string_format("%s: n_embd (%d) must be greater than 0\n", __func__, hparams.n_embd)); @@ -1682,6 +1699,9 @@ struct clip_model_loader { if (hparams.image_max_pixels < hparams.image_min_pixels) { throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels)); } + if (hparams.n_merge < 0 || hparams.n_merge >= 65536) { + throw std::runtime_error(string_format("%s: n_merge (%d) must be greater than 0 and less than 65536\n", __func__, hparams.n_merge)); + } } LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str()); @@ -1721,6 +1741,19 @@ struct clip_model_loader { LOG_INF("%s: audio_n_fft: %d\n", __func__, hparams.audio_n_fft); LOG_INF("%s: audio_window_len: %d\n", __func__, hparams.audio_window_len); LOG_INF("%s: audio_hop_len: %d\n", __func__, hparams.audio_hop_len); + + // GEMMA4UA is encoder-free: it uses n_mel_bins as a raw-waveform frame size (640) and has no FFT/filterbank, so the mel-range and FFT + // checks below do not apply to it. + const bool fft_based = model.proj_type != PROJECTOR_TYPE_GEMMA4UA; + + // Validate audio hparams loaded from GGUF metadata + if (hparams.n_mel_bins <= 0 || (fft_based && hparams.n_mel_bins > 256)) { + throw std::runtime_error(string_format("%s: n_mel_bins (%d) must be in range [1, 256]\n", __func__, hparams.n_mel_bins)); + } + if (fft_based && (hparams.audio_sample_rate <= 0 || hparams.audio_n_fft <= 0 || hparams.audio_hop_len <= 0 || hparams.audio_window_len <= 0)) { + throw std::runtime_error(string_format("%s: audio hparams invalid: sample_rate=%d n_fft=%d window_len=%d hop_len=%d\n", + __func__, hparams.audio_sample_rate, hparams.audio_n_fft, hparams.audio_window_len, hparams.audio_hop_len)); + } } LOG_INF("\n"); LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); @@ -1734,7 +1767,7 @@ struct clip_model_loader { std::map tensor_offset; std::vector tensors_to_load; - auto fin = std::ifstream(fname, std::ios::binary); + auto fin = open_ifstream_binary(fname); if (!fin) { throw std::runtime_error(string_format("%s: failed to open %s\n", __func__, fname.c_str())); } @@ -2713,7 +2746,7 @@ struct clip_model_loader { model.image_newline = get_tensor(TN_IMAGE_NEWLINE); // Load separate layerwise and spatial projector tensors - const auto projector_count = hparams.vision_feature_layer.size(); + const auto projector_count = hparams.feature_layers.size(); model.qf_proj_blocks.resize(projector_count); for (size_t bid = 0; bid < projector_count; ++bid) { auto & b = model.qf_proj_blocks[bid]; @@ -2769,37 +2802,60 @@ struct clip_model_loader { } // load data - if (!ctx_clip.no_alloc) { + { std::vector read_buf; + // start loading event + if (progress_callback){ + progress_callback(0.0, progress_callback_user_data); + } + + // compute total tensor data size for progress reporting + size_t total_data_size = 0; + for (auto & t : tensors_to_load) { + total_data_size += ggml_nbytes(t); + } + // alloc memory and offload data ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(ctx_clip.backend); ctx_clip.buf.reset(ggml_backend_alloc_ctx_tensors_from_buft(ctx_clip.ctx_data.get(), buft)); ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - for (auto & t : tensors_to_load) { - ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name); - GGML_ASSERT(cur && "tensor not found in ctx_data"); - auto it_off = tensor_offset.find(t->name); - GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor"); - const size_t offset = it_off->second; - fin.seekg(offset, std::ios::beg); - if (!fin) { - throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name)); - } - size_t num_bytes = ggml_nbytes(cur); - if (ggml_backend_buft_is_host(buft)) { - // for the CPU and Metal backend, we can read directly into the tensor - fin.read(reinterpret_cast(cur->data), num_bytes); - } else { - // read into a temporary buffer first, then copy to device memory - read_buf.resize(num_bytes); - fin.read(reinterpret_cast(read_buf.data()), num_bytes); - ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + // read the weight from file + if (!ctx_clip.no_alloc) { + size_t data_loaded = 0; + for (auto & t : tensors_to_load) { + ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name); + GGML_ASSERT(cur && "tensor not found in ctx_data"); + auto it_off = tensor_offset.find(t->name); + GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor"); + const size_t offset = it_off->second; + fin.seekg(offset, std::ios::beg); + if (!fin) { + throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name)); + } + size_t num_bytes = ggml_nbytes(cur); + if (ggml_backend_buft_is_host(buft)) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast(cur->data), num_bytes); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(num_bytes); + fin.read(reinterpret_cast(read_buf.data()), num_bytes); + ggml_backend_tensor_set(cur, read_buf.data(), 0, num_bytes); + } + data_loaded += num_bytes; + if (progress_callback && total_data_size > 0) { + const float progress = (float)data_loaded / (float)total_data_size; + if (!progress_callback(progress, progress_callback_user_data)) { + throw std::runtime_error(string_format("%s: model loading cancelled by progress_callback\n", __func__)); + } + } } + LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str()); + } else { + LOG_DBG("%s: no_alloc is set, skipping tensor data loading (%zu tensors)\n", __func__, tensors_to_load.size()); } fin.close(); - - LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str()); } } @@ -2819,20 +2875,40 @@ struct clip_model_loader { std::vector ops; }; - static void warmup(clip_ctx & ctx_clip) { + static clip_image_f32_batch get_dummy_batch(clip_ctx & ctx_clip) { // create a fake batch const auto & hparams = ctx_clip.model.hparams; clip_image_f32_batch batch; - clip_image_f32_ptr img(clip_image_f32_init()); + clip_image_f32 img; if (ctx_clip.model.modality == CLIP_MODALITY_VISION) { const int sz = hparams.warmup_image_size; - img->set_size({sz, sz}, false, false); + img.set_size({sz, sz}, false, false); LOG_INF("%s: warmup with image size = %d x %d\n", __func__, sz, sz); } else { - img->set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false); + // GEMMA4UA uses n_mel_bins as a raw-waveform frame size (640), not a mel-bin count, + // so the [1, 256] bound only applies to FFT-based models. + const bool fft_based = ctx_clip.model.proj_type != PROJECTOR_TYPE_GEMMA4UA; + if (hparams.n_mel_bins <= 0 || (fft_based && hparams.n_mel_bins > 256)) { + throw std::runtime_error(string_format("%s: invalid n_mel_bins (%d), must be in [1, 256]\n", __func__, hparams.n_mel_bins)); + } + img.set_size({hparams.warmup_audio_size, hparams.n_mel_bins}, false, false); LOG_INF("%s: warmup with audio size = %d\n", __func__, hparams.warmup_audio_size); } - batch.entries.push_back(std::move(img)); + batch.entries.push_back(img); + return batch; + } + + static void init_ctx(clip_ctx & ctx_clip) { + ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); + + // check batching support + auto batch = get_dummy_batch(ctx_clip); + auto builder = clip_get_graph_builder(&ctx_clip, batch); + ctx_clip.support_batch = builder->support_batch(); + } + + static void warmup(clip_ctx & ctx_clip) { + auto batch = get_dummy_batch(ctx_clip); warmup(ctx_clip, batch); } @@ -2905,9 +2981,7 @@ struct clip_model_loader { // only initialize backend buffers, but do not allocate them yet static support_info_graph reserve_compute_meta(clip_ctx & ctx_clip, const clip_image_f32_batch & batch) { - ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); - - ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch); + ggml_cgraph * gf = clip_get_graph_builder(&ctx_clip, batch)->build(); ggml_backend_sched_reserve(ctx_clip.sched.get(), gf); ctx_clip.mem_compute.clear(); @@ -2980,7 +3054,13 @@ struct clip_model_loader { } return; } - output = gguf_get_val_u32(ctx_gguf.get(), i); + const uint32_t val = gguf_get_val_u32(ctx_gguf.get(), i); + // sanity check + if (val > (uint32_t) INT32_MAX) { + throw std::runtime_error(string_format("%s: value %u for key '%s' exceeds INT32_MAX\n", + __func__, val, key.c_str())); + } + output = (int) val; } void get_f32(const std::string & key, float & output, bool required = true) const { @@ -2994,6 +3074,29 @@ struct clip_model_loader { output = gguf_get_val_f32(ctx_gguf.get(), i); } + void get_arr_f32(const std::string & key, std::vector & output, bool required = true) const { + const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); + if (i < 0) { + if (required) { + throw std::runtime_error("Key not found: " + key); + } + return; + } + const auto type = gguf_get_arr_type(ctx_gguf.get(), i); + if (type != GGUF_TYPE_FLOAT32) { + throw std::runtime_error(string_format("%s: array '%s' has type %d, expected %d (GGUF_TYPE_FLOAT32)\n", __func__, key.c_str(), type, GGUF_TYPE_FLOAT32)); + } + const size_t n = gguf_get_arr_n(ctx_gguf.get(), i); + if (n > (size_t) std::numeric_limits::max()) { + throw std::runtime_error(string_format("%s: array '%s' is too large (%zu elements)\n", __func__, key.c_str(), n)); + } + output.resize(n); + const float * values = (const float *)gguf_get_arr_data(ctx_gguf.get(), i); + for (size_t j = 0; j < n; ++j) { + output[j] = values[j]; + } + } + void get_string(const std::string & key, std::string & output, bool required = true) const { const int i = gguf_find_key(ctx_gguf.get(), key.c_str()); if (i < 0) { @@ -3013,11 +3116,18 @@ struct clip_model_loader { } return; } - int n = gguf_get_arr_n(ctx_gguf.get(), i); + const auto type = gguf_get_arr_type(ctx_gguf.get(), i); + if (type != GGUF_TYPE_INT32) { + throw std::runtime_error(string_format("%s: array '%s' has type %d, expected %d (GGUF_TYPE_INT32)\n", __func__, key.c_str(), type, GGUF_TYPE_INT32)); + } + const size_t n = gguf_get_arr_n(ctx_gguf.get(), i); + if (n > (size_t) std::numeric_limits::max()) { + throw std::runtime_error(string_format("%s: array '%s' is too large (%zu elements)\n", __func__, key.c_str(), n)); + } output.resize(n); const int32_t * values = (const int32_t *)gguf_get_arr_data(ctx_gguf.get(), i); - for (int i = 0; i < n; ++i) { - output[i] = values[i]; + for (size_t j = 0; j < n; ++j) { + output[j] = values[j]; } } @@ -3063,13 +3173,17 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params clip_ctx * ctx_audio = nullptr; try { - clip_model_loader loader(fname); + clip_model_loader loader(fname, + /* skip_tensors */ false, + ctx_params.progress_callback, + ctx_params.progress_callback_user_data); bool skip_audio = false; if (loader.has_vision) { ctx_vision = new clip_ctx(ctx_params); loader.load_hparams(ctx_vision->model, CLIP_MODALITY_VISION); loader.load_tensors(*ctx_vision); + loader.init_ctx(*ctx_vision); if (ctx_params.warmup) { loader.warmup(*ctx_vision); } @@ -3083,6 +3197,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_audio = new clip_ctx(ctx_params); loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO); loader.load_tensors(*ctx_audio); + loader.init_ctx(*ctx_audio); if (ctx_params.warmup) { loader.warmup(*ctx_audio); } @@ -3108,64 +3223,6 @@ struct clip_cap clip_get_cap(const char * fname) { return res; } -struct clip_image_size * clip_image_size_init() { - struct clip_image_size * load_image_size = new struct clip_image_size(); - load_image_size->width = 448; - load_image_size->height = 448; - return load_image_size; -} - -struct clip_image_u8 * clip_image_u8_init() { - return new clip_image_u8(); -} - -struct clip_image_f32 * clip_image_f32_init() { - return new clip_image_f32(); -} - -struct clip_image_f32_batch * clip_image_f32_batch_init() { - return new clip_image_f32_batch(); -} - -void clip_image_size_free(struct clip_image_size * load_image_size) { - if (load_image_size == nullptr) { - return; - } - delete load_image_size; -} -void clip_image_u8_free(struct clip_image_u8 * img) { delete img; } -void clip_image_f32_free(struct clip_image_f32 * img) { delete img; } -void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { delete batch; } -void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { delete batch; } - -size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) { - return batch->entries.size(); -} - -size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) { - if (idx < 0 || idx >= (int)batch->entries.size()) { - LOG_ERR("%s: invalid index %d\n", __func__, idx); - return 0; - } - return batch->entries[idx]->nx(); -} - -size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) { - if (idx < 0 || idx >= (int)batch->entries.size()) { - LOG_ERR("%s: invalid index %d\n", __func__, idx); - return 0; - } - return batch->entries[idx]->ny(); -} - -clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) { - if (idx < 0 || idx >= (int)batch->entries.size()) { - LOG_ERR("%s: invalid index %d\n", __func__, idx); - return nullptr; - } - return batch->entries[idx].get(); -} - void clip_free(clip_ctx * ctx) { if (ctx == nullptr) { return; @@ -3173,23 +3230,11 @@ void clip_free(clip_ctx * ctx) { delete ctx; } -int32_t clip_get_image_size(const struct clip_ctx * ctx) { - return ctx->model.hparams.image_size; -} - -int32_t clip_get_patch_size(const struct clip_ctx * ctx) { - return ctx->model.hparams.patch_size; -} - -int32_t clip_get_hidden_size(const struct clip_ctx * ctx) { - return ctx->model.hparams.n_embd; -} - const char * clip_patch_merge_type(const struct clip_ctx * ctx) { return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat"; } -int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) { +int clip_n_output_tokens_x(const clip_ctx * ctx, const clip_image_f32 * img) { const auto & params = ctx->model.hparams; const int n_total = clip_n_output_tokens(ctx, img); const auto & proj = ctx->proj_type(); @@ -3212,7 +3257,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * return n_total; } -int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) { +int clip_n_output_tokens_y(const clip_ctx * ctx, const clip_image_f32 * img) { const auto & params = ctx->model.hparams; const auto & proj = ctx->proj_type(); switch (proj) { @@ -3234,7 +3279,7 @@ int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * return 1; } -int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) { +int clip_n_output_tokens(const clip_ctx * ctx, const clip_image_f32 * img) { const auto & params = ctx->model.hparams; // for models with fixed size image, the input image is already pre-processed and resized to square @@ -3356,8 +3401,8 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im { // dynamic size int n_merge = ctx->model.hparams.n_merge; - int n_patches_x = img->nx() / patch_size / (n_merge > 0 ? n_merge : 1); - int n_patches_y = img->ny() / patch_size / (n_merge > 0 ? n_merge : 1); + int n_patches_x = img->nx() / patch_size / n_merge; + int n_patches_y = img->ny() / patch_size / n_merge; if (ctx->model.token_embd_img_break) { n_patches = n_patches_y * n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row } else { @@ -3484,25 +3529,21 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im return n_patches; } -bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { +bool clip_image_encode(struct clip_ctx * ctx, int n_threads, const clip_image_f32 * img, std::vector & out_vec) { clip_image_f32_batch imgs; - clip_image_f32_ptr img_copy(clip_image_f32_init()); - *img_copy = *img; + clip_image_f32 img_copy = *img; imgs.entries.push_back(std::move(img_copy)); - return clip_image_batch_encode(ctx, n_threads, &imgs, vec); + return clip_image_batch_encode(ctx, n_threads, &imgs, out_vec); } -bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { +bool clip_image_batch_encode(clip_ctx * ctx, int n_threads, const clip_image_f32_batch * imgs_c_ptr, std::vector & out_batch_embd) { const clip_image_f32_batch & imgs = *imgs_c_ptr; int n_batch_cur = imgs.entries.size(); - // maximum supported batch size, usually == 2 for qwen-vl-based models - int n_batch_max = clip_model_n_batch_max(ctx); - - // TODO @ngxson : implement batch size > 1 as a loop - // we don't need true batching support because the cgraph will gonna be big anyway - if (n_batch_cur > n_batch_max) { + // [QWEN_VIDEO] for video models, the batch dimension is used as temporal dimension for merged frames + if (!ctx->support_batch && n_batch_cur > clip_model_n_temporal_merge(ctx)) { + LOG_ERR("%s: batch size %d exceeds maximum supported batch/temporal-merge size %d\n", __func__, n_batch_cur, clip_model_n_temporal_merge(ctx)); return false; } @@ -3513,15 +3554,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // build the inference graph ggml_backend_sched_reset(ctx->sched.get()); - ggml_cgraph * gf = clip_image_build_graph(ctx, imgs); + ggml_cgraph * gf = clip_get_graph_builder(ctx, imgs)->build(); ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); // set inputs const auto & model = ctx->model; const auto & hparams = model.hparams; - const int image_size_width = imgs.entries[0]->nx(); - const int image_size_height = imgs.entries[0]->ny(); + const int image_size_width = imgs.entries[0].nx(); + const int image_size_height = imgs.entries[0].ny(); const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); @@ -3559,7 +3600,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima if (!imgs.is_audio) { size_t nelem = 0; for (const auto & img : imgs.entries) { - nelem += img->nx() * img->ny() * 3; + nelem += img.nx() * img.ny() * 3; } std::vector inp_raw(nelem); @@ -3577,12 +3618,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // IMPORTANT: [QWEN_VIDEO] the batch dim is currently used for temporal dim in Qwen-VL models // All entries must have the same spatial size (enforced by can_batch_with() during merging) { - const int nx = imgs.entries[0]->nx(); - const int ny = imgs.entries[0]->ny(); + const int nx = imgs.entries[0].nx(); + const int ny = imgs.entries[0].ny(); const int n = nx * ny; for (int b = 0; b < n_batch_cur; b++) { - const auto & buf = imgs.entries[b]->get_ro_buf(); + LOG_DBG("%s: copying image %d/%d to input buffer (nx=%d, ny=%d)\n", __func__, b+1, n_batch_cur, nx, ny); + const auto & buf = imgs.entries[b].get_ro_buf(); float * batch_entry = inp_raw.data() + b * (3*n); for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { @@ -3602,9 +3644,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima GGML_ASSERT(imgs.entries.size() == 1); const auto & mel_inp = imgs.entries[0]; - const auto & buf = mel_inp->get_ro_buf(); - const int n_step = mel_inp->nx(); - const int n_mel = mel_inp->ny(); + const auto & buf = mel_inp.get_ro_buf(); + const int n_step = mel_inp.nx(); + const int n_mel = mel_inp.ny(); GGML_ASSERT((size_t)n_step * n_mel == buf.size()); set_input_f32("inp_raw", buf); @@ -4218,7 +4260,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima GGML_ASSERT(imgs.entries.size() == 1); const auto & img0 = imgs.entries.front(); // Compute n_pos matching SSCP output: two stride-2 convs - int n_pos = img0->nx(); + int n_pos = img0.nx(); for (int i = 0; i < 2; i++) { n_pos = (n_pos - 1) / 2 + 1; } // Chunked local attention: blocked causal mask and RPE @@ -4266,7 +4308,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima case PROJECTOR_TYPE_LFM2A: { GGML_ASSERT(imgs.entries.size() == 1); - const auto n_frames = clip_n_output_tokens(ctx, imgs.entries.front().get()); + const auto n_frames = clip_n_output_tokens(ctx, &imgs.entries.front()); auto d_model = 512; auto seq_len = n_frames * 2 - 1; @@ -4324,7 +4366,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // reshapes as ggml_get_rows gathers. The names are set // by g4v_gather() in models/granite4-vision.cpp. const int patch_size = model.hparams.patch_size; - const int image_side = imgs.entries.front()->nx() / patch_size; + const int image_side = imgs.entries.front().nx() / patch_size; const int window_side = hparams.downsample_window_side; const int query_side = hparams.downsample_query_side; const int n = image_side / window_side; @@ -4382,7 +4424,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // Stage 1b only uses block 0's permutations; future stages // will upload all blocks. - for (size_t bid = 0; bid < hparams.vision_feature_layer.size(); ++bid) { + for (size_t bid = 0; bid < hparams.feature_layers.size(); ++bid) { const std::string prefix = "g4v_blk" + std::to_string(bid) + "_"; upload(prefix + "win_idx", make_win_idx(image_side, window_side)); upload(prefix + "qwin_idx", make_win_idx(new_side, query_side)); @@ -4416,24 +4458,34 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // the last node is the embedding tensor ggml_tensor * embeddings = ggml_graph_node(gf, -1); - // sanity check (only support batch size of 1 for now) + // sanity check (assuming that all images in batch have the same number of tokens, so we only check the first one) const int n_tokens_out = embeddings->ne[1]; - const int expected_n_tokens_out = clip_n_output_tokens(ctx, imgs.entries[0].get()); + const int expected_n_tokens_out = clip_n_output_tokens(ctx, &imgs.entries[0]); if (n_tokens_out != expected_n_tokens_out) { LOG_ERR("%s: expected output %d tokens, got %d\n", __func__, expected_n_tokens_out, n_tokens_out); GGML_ABORT("Invalid number of output tokens"); } - // copy the embeddings to the location passed by the user - if (vec != nullptr) { - ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); + LOG_DBG("%s: output embedding shape [%d, %d, %d]\n", __func__, + (int)embeddings->ne[0], (int)embeddings->ne[1], (int)embeddings->ne[2]); + + // copy output to user buffer if provided + // if output is empty, skip the copy + if (!out_batch_embd.empty()) { + if (out_batch_embd.size() != (size_t)ggml_nelements(embeddings)) { + LOG_ERR("%s: output buffer has %zu elements but expected %zu\n", __func__, out_batch_embd.size(), (size_t)ggml_nelements(embeddings)); + GGML_ABORT("Output buffer size mismatch"); + } + ggml_backend_tensor_get(embeddings, out_batch_embd.data(), 0, ggml_nbytes(embeddings)); + } else { + LOG_WRN("%s: output buffer is empty, skipping copy\n", __func__); } // Debug: dump final embeddings if MTMD_DEBUG_EMBEDDINGS is set if (ctx->debug_output_embeddings) { const int64_t n_embd = embeddings->ne[0]; const int64_t n_tokens = embeddings->ne[1]; - std::vector emb_data(n_embd * n_tokens); + std::vector emb_data(ggml_nelements(embeddings)); ggml_backend_tensor_get(embeddings, emb_data.data(), 0, ggml_nbytes(embeddings)); LOG_INF("\n=== MTMD_DEBUG_EMBEDDINGS ===\n"); @@ -4570,7 +4622,14 @@ bool clip_has_audio_encoder(const struct clip_ctx * ctx) { return ctx->model.modality == CLIP_MODALITY_AUDIO; } -int clip_model_n_batch_max(const struct clip_ctx * ctx) { +bool clip_support_batch(const struct clip_ctx * ctx) { + return ctx->support_batch; +} + +// TODO @ngxson : this is no longer correct with mtmd_batch API +// this was only meant to be used by qwen-vl-based models, to fuse 2 input images into one (qwen-vl video support) +// this logic should be refactored in near future to distinctly handle "merge frames" and "batching" +int clip_model_n_temporal_merge(const struct clip_ctx * ctx) { switch (ctx->proj_type()) { case PROJECTOR_TYPE_QWEN2VL: case PROJECTOR_TYPE_QWEN25VL: diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h index 18c7a1d1a7c4..967093a812d6 100644 --- a/tools/mtmd/clip.h +++ b/tools/mtmd/clip.h @@ -24,12 +24,14 @@ struct clip_image_size { return !(*this == other); } int area() const { + // avoid overflow when computing area + GGML_ASSERT(width >= 0 && width <= 46000); + GGML_ASSERT(height >= 0 && height <= 46000); return width * height; } }; struct clip_image_f32; -struct clip_image_u8_batch; struct clip_image_f32_batch; enum clip_modality { @@ -52,6 +54,8 @@ struct clip_context_params { ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; bool no_alloc; + mtmd_progress_callback progress_callback; + void * progress_callback_user_data; }; struct clip_init_result { @@ -63,42 +67,22 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params void clip_free(struct clip_ctx * ctx); -int32_t clip_get_image_size (const struct clip_ctx * ctx); -int32_t clip_get_patch_size (const struct clip_ctx * ctx); -int32_t clip_get_hidden_size(const struct clip_ctx * ctx); - // TODO: should be enum, not string const char * clip_patch_merge_type(const struct clip_ctx * ctx); -int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img); +int clip_n_output_tokens(const clip_ctx * ctx, const clip_image_f32 * img); // for M-RoPE, this will be the number of token positions in X and Y directions // for other models, X will be the total number of tokens and Y will be 1 -int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img); -int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img); +int clip_n_output_tokens_x(const clip_ctx * ctx, const clip_image_f32 * img); +int clip_n_output_tokens_y(const clip_ctx * ctx, const clip_image_f32 * img); // this should be equal to the embedding dimension of the text model int clip_n_mmproj_embd(const struct clip_ctx * ctx); -struct clip_image_size * clip_image_size_init(void); -struct clip_image_u8 * clip_image_u8_init (void); -struct clip_image_f32 * clip_image_f32_init(void); -struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava - -void clip_image_size_free (struct clip_image_size * img_size); -void clip_image_u8_free (struct clip_image_u8 * img); -void clip_image_f32_free(struct clip_image_f32 * img); -void clip_image_u8_batch_free (struct clip_image_u8_batch * batch); -void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); - -// use for accessing underlay data of clip_image_f32_batch -size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size() -size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx -size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny -struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data - -bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec); -bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec); +// TODO: remove clip_image_encode() and always use batched version +bool clip_image_encode (struct clip_ctx * ctx, int n_threads, const clip_image_f32 * img, std::vector & out_vec); +bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, std::vector & out_batch_embd); bool clip_is_llava(const struct clip_ctx * ctx); // note for contributor: this clip_is_(model) pattern is deprecated @@ -107,7 +91,9 @@ bool clip_is_llava(const struct clip_ctx * ctx); bool clip_has_vision_encoder(const struct clip_ctx * ctx); bool clip_has_audio_encoder(const struct clip_ctx * ctx); -int clip_model_n_batch_max(const struct clip_ctx * ctx); +bool clip_support_batch(const struct clip_ctx * ctx); + +int clip_model_n_temporal_merge(const struct clip_ctx * ctx); // TODO @ngxson : remove, refactor this std::map clip_get_mem_usage(const struct clip_ctx * ctx); diff --git a/tools/mtmd/models/gemma4v.cpp b/tools/mtmd/models/gemma4v.cpp index 3570d6da1351..87cbd43fc5fd 100644 --- a/tools/mtmd/models/gemma4v.cpp +++ b/tools/mtmd/models/gemma4v.cpp @@ -10,7 +10,7 @@ ggml_cgraph * clip_graph_gemma4v::build() { ggml_set_name(inp_raw, "inp_raw_scaled"); ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - inp = ggml_reshape_2d(ctx0, inp, n_patches, n_embd); + inp = ggml_reshape_3d(ctx0, inp, n_patches, n_embd, n_batch); inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); ggml_set_name(inp, "inp"); // note: no patch bias @@ -51,10 +51,11 @@ ggml_cgraph * clip_graph_gemma4v::build() { // first half ggml_tensor * first; { - first = ggml_view_3d(ctx0, cur, - n_dim/2, n_head, n_pos, + first = ggml_view_4d(ctx0, cur, + n_dim/2, n_head, n_pos, n_batch, cur->nb[1], cur->nb[2], + cur->nb[3], 0); first = ggml_rope_ext( ctx0, @@ -70,10 +71,11 @@ ggml_cgraph * clip_graph_gemma4v::build() { // second half ggml_tensor * second; { - second = ggml_view_3d(ctx0, cur, - n_dim/2, n_head, n_pos, + second = ggml_view_4d(ctx0, cur, + n_dim/2, n_head, n_pos, n_batch, cur->nb[1], cur->nb[2], + cur->nb[3], n_dim/2 * ggml_element_size(cur)); second = ggml_rope_ext( ctx0, @@ -103,14 +105,14 @@ ggml_cgraph * clip_graph_gemma4v::build() { const int kernel_size = hparams.n_merge; GGML_ASSERT(kernel_size > 0); - // [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, 1] - cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, 1); + // [n_embd, n_patches] -> [n_patches_x, n_patches_y, n_embd, n_batch] + cur = ggml_cont_4d(ctx0, ggml_transpose(ctx0, cur), n_patches_x, n_patches_y, n_embd, n_batch); cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, kernel_size, kernel_size, kernel_size, kernel_size, 0, 0); const int out_x = n_patches_x / kernel_size; const int out_y = n_patches_y / kernel_size; - // [out_x, out_y, n_embd, 1] -> [n_embd, out_x * out_y] - cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, 1); + // [out_x, out_y, n_embd, n_batch] -> [n_embd, out_x * out_y, n_batch] + cur = ggml_reshape_3d(ctx0, cur, out_x * out_y, n_embd, n_batch); cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); cur = ggml_scale(ctx0, cur, sqrtf((float)n_embd)); cb(cur, "pooled", -1); diff --git a/tools/mtmd/models/granite-speech.cpp b/tools/mtmd/models/granite-speech.cpp index 0bd4d75ac513..a158a59ce9ac 100644 --- a/tools/mtmd/models/granite-speech.cpp +++ b/tools/mtmd/models/granite-speech.cpp @@ -1,5 +1,7 @@ #include "models.h" +#include + ggml_cgraph * clip_graph_granite_speech::build() { const int n_frames = img.nx(); const int context_size = hparams.audio_chunk_size; @@ -11,6 +13,10 @@ ggml_cgraph * clip_graph_granite_speech::build() { const int padded_len = num_blocks * context_size; const int remainder = n_frames % context_size; + // Calculate projector input dimension based on feature layers + const int proj_input_dim = n_embd * (hparams.feature_layers.size() + 1); + const bool use_feature_concat = !hparams.feature_layers.empty(); + ggml_tensor * attn_dists = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, context_size * context_size); ggml_set_name(attn_dists, "attn_dists"); ggml_set_input(attn_dists); @@ -31,6 +37,15 @@ ggml_cgraph * clip_graph_granite_speech::build() { cur = ggml_add(ctx0, cur, model.inp_proj_b); cb(cur, "inp_linear", -1); + // Capture layer 0 if requested (after input_linear) + ggml_tensor * concat_result = nullptr; + if (use_feature_concat) { + if (std::find(hparams.feature_layers.begin(), hparams.feature_layers.end(), 0) != hparams.feature_layers.end()) { + concat_result = cur; + cb(concat_result, "feature_layer_0", -1); + } + } + for (int il = 0; il < n_layer; il++) { const auto & layer = model.layers[il]; auto * residual = cur; @@ -168,6 +183,18 @@ ggml_cgraph * clip_graph_granite_speech::build() { NORM_TYPE_NORMAL, eps, il); cb(cur, "layer_out", il); + // Capture intermediate layer (il + 1) if requested + if (use_feature_concat) { + if (hparams.is_feature_layer(il + 1)) { + if (concat_result == nullptr) { + concat_result = cur; + } else { + concat_result = ggml_concat(ctx0, concat_result, cur, 0); + } + cb(concat_result, string_format("feature_layer_%d", il + 1).c_str(), il); + } + } + // CTC branch if (il + 1 == ctc_layer) { auto * mid = build_mm(model.ctc_out_w, cur); @@ -180,6 +207,13 @@ ggml_cgraph * clip_graph_granite_speech::build() { } } + // Append final output to concatenated features if using feature concatenation + if (use_feature_concat && concat_result != nullptr) { + concat_result = ggml_concat(ctx0, concat_result, cur, 0); + cb(concat_result, "concat_final", -1); + cur = concat_result; + } + cb(cur, "encoder_out", -1); // QFormer projector @@ -197,7 +231,7 @@ ggml_cgraph * clip_graph_granite_speech::build() { cur = ggml_pad(ctx0, cur, 0, padded_proj - n_frames, 0, 0); } - ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, n_embd, window_size, nblocks_proj); + ggml_tensor * enc_windows = ggml_reshape_3d(ctx0, cur, proj_input_dim, window_size, nblocks_proj); ggml_tensor * queries = build_norm(model.qf_proj_blocks[0].qf_proj_query, model.qf_proj_blocks[0].qf_proj_norm_w, model.qf_proj_blocks[0].qf_proj_norm_b, diff --git a/tools/mtmd/models/granite4-vision.cpp b/tools/mtmd/models/granite4-vision.cpp index 9adb6f0fdbfa..1b252543c016 100644 --- a/tools/mtmd/models/granite4-vision.cpp +++ b/tools/mtmd/models/granite4-vision.cpp @@ -304,14 +304,14 @@ ggml_cgraph * clip_graph_granite4_vision::build() { } // --- Stage 1b/1c: WindowQFormer blocks --- - const int projector_count = hparams.vision_feature_layer.size(); + const int projector_count = hparams.feature_layers.size(); const float qformer_eps = 1e-12f; ggml_tensor * mmproj = nullptr; for (int bid = 0; bid < projector_count; ++bid) { const auto & blk = model.qf_proj_blocks[bid]; - int vlayer = hparams.vision_feature_layer[bid]; + int vlayer = hparams.feature_layers[bid]; GGML_ASSERT(vlayer >= 0 && vlayer < n_layer); ggml_tensor * h = layer_outs[vlayer]; diff --git a/tools/mtmd/models/internvl.cpp b/tools/mtmd/models/internvl.cpp index 9aded3b97cf7..65d7d5a6b7ba 100644 --- a/tools/mtmd/models/internvl.cpp +++ b/tools/mtmd/models/internvl.cpp @@ -8,7 +8,9 @@ ggml_cgraph * clip_graph_internvl::build() { ggml_tensor * inp = build_inp(); // add CLS token - inp = ggml_concat(ctx0, inp, model.class_embedding, 1); + ggml_tensor * cls_repeated = ggml_repeat_4d(ctx0, model.class_embedding, + model.class_embedding->ne[0], 1, n_batch, 1); + inp = ggml_concat(ctx0, inp, cls_repeated, 1); // The larger models use a different ViT, which uses RMS norm instead of layer norm // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188 @@ -24,14 +26,15 @@ ggml_cgraph * clip_graph_internvl::build() { nullptr); // remove CLS token - cur = ggml_view_2d(ctx0, cur, - n_embd, n_patches, - ggml_row_size(cur->type, n_embd), 0); + cur = ggml_view_3d(ctx0, cur, + n_embd, n_patches, n_batch, + cur->nb[1], cur->nb[2], 0); + cur = ggml_cont(ctx0, cur); // pixel shuffle { const int scale_factor = model.hparams.n_merge; - const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int bsz = n_batch; const int height = n_patches_y; const int width = n_patches_x; GGML_ASSERT(scale_factor > 0); @@ -44,9 +47,10 @@ ggml_cgraph * clip_graph_internvl::build() { bsz); cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // flatten to 2D - cur = ggml_cont_2d(ctx0, cur, + cur = ggml_cont_3d(ctx0, cur, n_embd * scale_factor * scale_factor, - cur->ne[1] * cur->ne[2]); + cur->ne[1] * cur->ne[2], + cur->ne[3]); } // projector (always using GELU activation) diff --git a/tools/mtmd/models/llava.cpp b/tools/mtmd/models/llava.cpp index 5aa3d2f0fac1..47efe68bd835 100644 --- a/tools/mtmd/models/llava.cpp +++ b/tools/mtmd/models/llava.cpp @@ -21,7 +21,7 @@ ggml_cgraph * clip_graph_llava::build() { // If we set explicit vision feature layers, only go up to the deepest one // NOTE: only used by granite-vision models for now - for (const auto & feature_layer : hparams.vision_feature_layer) { + for (const auto & feature_layer : hparams.feature_layers) { if (feature_layer > deepest_feature_layer) { deepest_feature_layer = feature_layer; } @@ -59,7 +59,7 @@ ggml_cgraph * clip_graph_llava::build() { // If this is an embedding feature layer, save the output. // NOTE: 0 index here refers to the input to the encoder. - if (hparams.is_vision_feature_layer(il)) { + if (hparams.is_feature_layer(il)) { embedding_stack.push_back(cur); } @@ -134,7 +134,7 @@ ggml_cgraph * clip_graph_llava::build() { // process vision feature layers (used by granite) { // final layer is a vision feature layer - if (hparams.is_vision_feature_layer(max_feature_layer)) { + if (hparams.is_feature_layer(max_feature_layer)) { embedding_stack.push_back(inpL); } diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 12082a5280a8..12d5e6949320 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -16,6 +16,7 @@ struct clip_graph_gemma4v : clip_graph { clip_graph_gemma4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override; + bool support_batch() const override { return true; } }; struct clip_graph_gemma4uv : clip_graph { @@ -79,6 +80,7 @@ struct clip_graph_minicpmv4_6 : clip_graph { struct clip_graph_internvl : clip_graph { clip_graph_internvl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; + bool support_batch() const override { return true; } }; struct clip_graph_nemotron_v2_vl : clip_graph { diff --git a/tools/mtmd/models/pixtral.cpp b/tools/mtmd/models/pixtral.cpp index d6d037b69412..edfae0825360 100644 --- a/tools/mtmd/models/pixtral.cpp +++ b/tools/mtmd/models/pixtral.cpp @@ -63,8 +63,8 @@ ggml_cgraph * clip_graph_pixtral::build() { // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension // after the concatenation, we have a tensor with shape [n_embd, n_patches_per_row + 1, n_rows] - const int p_y = n_merge > 0 ? n_patches_y / n_merge : n_patches_y; - const int p_x = n_merge > 0 ? n_patches_x / n_merge : n_patches_x; + const int p_y = n_patches_y / n_merge; + const int p_x = n_patches_x / n_merge; const int p_total = p_x * p_y; const int n_embd_text = cur->ne[0]; const int n_tokens_output = p_total + p_y - 1; // one [IMG_BREAK] per row, except the last row diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index 13f211fd9021..b72fd067a508 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -32,8 +32,8 @@ void mtmd_audio_cache::fill_hann_window(uint32_t length, bool periodic) { } } -void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel, - int n_fft, +void mtmd_audio_cache::fill_mel_filterbank_matrix(int64_t n_mel, + int64_t n_fft, int sample_rate, float fmin, float fmax, @@ -86,11 +86,16 @@ void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel, hz_pts[i] = mel_to_hz(mel_pts[i]); } - const int n_fft_bins = n_fft / 2 + 1; + const int64_t n_fft_bins = n_fft / 2 + 1; + + // Validate allocation size + if ((size_t)n_mel * (size_t)n_fft_bins > SIZE_MAX) { + GGML_ASSERT(false && "mel filterbank allocation too large"); + } // filterbank - std::vector out(n_mel * n_fft_bins, 0); - for (int m = 0; m < n_mel; ++m) { + std::vector out((size_t)n_mel * (size_t)n_fft_bins, 0); + for (int64_t m = 0; m < n_mel; ++m) { const double f_left = hz_pts[m]; const double f_center = hz_pts[m + 1]; const double f_right = hz_pts[m + 2]; @@ -266,8 +271,8 @@ static void ifft(const mtmd_audio_cache & cache, float * in, int N, float * out) } struct filter_params { - int32_t n_mel; - int32_t n_fft_bins; + int64_t n_mel; + int64_t n_fft_bins; int32_t hann_window_size; int32_t hop_length; int32_t sample_rate; @@ -293,8 +298,8 @@ static void log_mel_spectrogram_worker_thread(int ith, std::vector fft_in(frame_size * 2, 0.0); std::vector fft_out(frame_size * 2 * 2 * 2); - int n_fft_bins = params.n_fft_bins; - int i = ith; + int64_t n_fft_bins = params.n_fft_bins; + int64_t i = ith; const auto & filters = cache.filters; @@ -302,17 +307,18 @@ static void log_mel_spectrogram_worker_thread(int ith, GGML_ASSERT(n_fft_bins == 1 + (frame_size / 2)); GGML_ASSERT(cache.sin_vals.size() == cache.cos_vals.size()); // calculate FFT only when fft_in are not all zero - for (; i < std::min(n_samples / frame_step + 1, out.n_len); i += n_threads) { - const int offset = i * frame_step; + for (; i < std::min((int64_t)(n_samples / frame_step + 1), out.n_len); i += n_threads) { + const int64_t offset = i * frame_step; // apply Hann window (~10% faster) - for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) { + const int valid_len = std::min(frame_size, std::max(0, n_samples - (int)offset)); + for (int j = 0; j < valid_len; j++) { fft_in[j] = hann[j] * samples[offset + j]; } // fill the rest with zeros - if (n_samples - offset < frame_size) { - std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0); + if (valid_len < frame_size) { + std::fill(fft_in.begin() + valid_len, fft_in.end(), 0.0); } // FFT @@ -325,7 +331,7 @@ static void log_mel_spectrogram_worker_thread(int ith, } // mel spectrogram - for (int j = 0; j < out.n_mel; j++) { + for (int64_t j = 0; j < out.n_mel; j++) { double sum = 0.0; // unroll loop (suggested by GH user @lunixbochs) int k = 0; @@ -339,21 +345,21 @@ static void log_mel_spectrogram_worker_thread(int ith, } // handle n_fft remainder for (; k < n_fft_bins; k++) { - sum += fft_out[k] * filters.data[j * n_fft_bins + k]; + sum += fft_out[k] * filters.data[(size_t)j * n_fft_bins + k]; } sum = std::max(sum, (double)params.mel_floor); sum = params.use_natural_log ? log(sum) : log10(sum); - out.data[j * out.n_len + i] = sum; + out.data[(size_t)j * out.n_len + i] = sum; } } // Otherwise fft_out are all zero double sum = params.use_natural_log ? log(1e-10) : log10(1e-10); for (; i < out.n_len; i += n_threads) { - for (int j = 0; j < out.n_mel; j++) { - out.data[j * out.n_len + i] = sum; + for (int64_t j = 0; j < out.n_mel; j++) { + out.data[(size_t)j * out.n_len + i] = sum; } } } @@ -437,16 +443,21 @@ static bool log_mel_spectrogram( GGML_ASSERT(params.hop_length > 0); out.n_mel = params.n_mel; out.n_len = (n_samples - frame_size) / frame_step + 1; - // TODO: handle these checks better - if (out.n_mel > 0 && (unsigned long)out.n_len > SIZE_MAX / out.n_mel) { - LOG_ERR("%s: size overflow\n", __func__); + // Validate dimensions before allocation to prevent integer overflow + if (out.n_mel <= 0 || out.n_len <= 0) { + LOG_ERR("%s: invalid mel dimensions n_mel=%lld n_len=%lld\n", __func__, (long long)out.n_mel, (long long)out.n_len); + return false; + } + const size_t total_size = (size_t)out.n_mel * (size_t)out.n_len; + if (total_size > SIZE_MAX / sizeof(float)) { + LOG_ERR("%s: size overflow: n_mel=%lld n_len=%lld\n", __func__, (long long)out.n_mel, (long long)out.n_len); return false; } if (n_samples < frame_size) { LOG_ERR("%s: not enough samples after padding\n", __func__); return false; } - out.data.resize(out.n_mel * out.n_len); + out.data.resize(total_size); { std::vector workers(n_threads - 1); @@ -464,38 +475,39 @@ static bool log_mel_spectrogram( } } - const int effective_n_len = n_samples_in / frame_step; + const int64_t effective_n_len = n_samples_in / frame_step; if (params.norm_per_feature) { GGML_ASSERT(effective_n_len > 1); - for (int i = 0; i < out.n_mel; i++) { + for (int64_t i = 0; i < out.n_mel; i++) { double mean = 0; - for (int j = 0; j < effective_n_len; ++j) { - mean += out.data[i * out.n_len + j]; + for (int64_t j = 0; j < effective_n_len; ++j) { + mean += out.data[(size_t)i * out.n_len + j]; } mean /= effective_n_len; double var = 0.0; - for (int j = 0; j < effective_n_len; ++j) { - const double value = out.data[i * out.n_len + j] - mean; + for (int64_t j = 0; j < effective_n_len; ++j) { + const double value = out.data[(size_t)i * out.n_len + j] - mean; var += value * value; } var /= effective_n_len - 1; // unbiased const double mstd = std::sqrt(var + 1e-5); - for (int j = 0; j < effective_n_len; ++j) { - auto &value = out.data[i * out.n_len + j]; + for (int64_t j = 0; j < effective_n_len; ++j) { + auto &value = out.data[(size_t)i * out.n_len + j]; value = (value - mean) / mstd; } // pad the rest with zeros - for (int j = effective_n_len; j < out.n_len; ++j) { - out.data[i * out.n_len + j] = 0.0; + for (int64_t j = effective_n_len; j < out.n_len; ++j) { + out.data[(size_t)i * out.n_len + j] = 0.0; } } } else if (!params.no_padding) { // Whisper-style clamping and normalization (NOT used by Gemma4) double mmax = -1e20; - for (int i = 0; i < out.n_mel*out.n_len; i++) { + const size_t mel_size = (size_t)out.n_mel * (size_t)out.n_len; + for (size_t i = 0; i < mel_size; i++) { if (out.data[i] > mmax) { mmax = out.data[i]; } @@ -503,7 +515,7 @@ static bool log_mel_spectrogram( mmax -= 8.0; - for (int i = 0; i < out.n_mel*out.n_len; i++) { + for (size_t i = 0; i < mel_size; i++) { if (out.data[i] < mmax) { out.data[i] = mmax; } @@ -582,13 +594,13 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s // because the cgraph in clip.cpp only accepts 3000 frames each, we need to split the mel // we always expect the mel to have 3000 silent frames at the end if (DEBUG) { - printf("output: n_mel = %d, n_len = %d\n", out_full.n_mel, out_full.n_len); + printf("output: n_mel = %d, n_len = %d\n", (int) out_full.n_mel, (int) out_full.n_len); } const size_t frames_per_chunk = 3000; GGML_ASSERT((size_t) out_full.n_len > frames_per_chunk); for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) { - int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off); - if ((size_t) n_len < frames_per_chunk) { + int64_t n_len = std::min((int64_t)frames_per_chunk, out_full.n_len - (int64_t)off); + if (n_len < (int64_t)frames_per_chunk) { break; // last incomplete chunk will always be a padded chunk, safe to ignore } @@ -596,10 +608,10 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float * s out_chunk.n_len = n_len; out_chunk.n_mel = out_full.n_mel; out_chunk.n_len_org = out_full.n_mel; // unused - out_chunk.data.reserve(out_chunk.n_mel * out_chunk.n_len); + out_chunk.data.reserve((size_t)out_chunk.n_mel * (size_t)out_chunk.n_len); - for (int i = 0; i < out_full.n_mel; i++) { - auto src = out_full.data.begin() + i * out_full.n_len + off; + for (int64_t i = 0; i < out_full.n_mel; i++) { + auto src = out_full.data.begin() + (size_t)i * out_full.n_len + off; out_chunk.data.insert(out_chunk.data.end(), src, src + frames_per_chunk); } @@ -681,8 +693,8 @@ bool mtmd_audio_preprocessor_qwen3a::preprocess(const float * sa // The effective frame count: center-padded STFT gives ~n_samples/hop_length frames. // We take min(mel_full.n_len, n_samples/hop + 1) to avoid including excess frames. - const int n_eff = std::min(mel_full.n_len, - (int)(n_samples / hparams.audio_hop_len) + 1); + const int64_t n_eff = std::min(mel_full.n_len, + (int64_t)(n_samples / hparams.audio_hop_len) + 1); // Split into inference windows matching n_window_infer=800 from model config. // Each window is padded to the next multiple of chunk_size for the cgraph. @@ -690,18 +702,18 @@ bool mtmd_audio_preprocessor_qwen3a::preprocess(const float * sa const int chunk_size = 100; // conv sub-chunk size (n_window * 2, n_window=50) const int window_size = 800; // mel frames per forward pass (n_window_infer=800) - for (int off = 0; off < n_eff; off += window_size) { - const int win_eff = std::min(window_size, n_eff - off); - const int n_chunks = (win_eff + chunk_size - 1) / chunk_size; - const int n_padded = n_chunks * chunk_size; + for (int64_t off = 0; off < n_eff; off += window_size) { + const int64_t win_eff = std::min((int64_t)window_size, n_eff - off); + const int64_t n_chunks = (win_eff + chunk_size - 1) / chunk_size; + const int64_t n_padded = n_chunks * chunk_size; mtmd_audio_mel out; out.n_mel = mel_full.n_mel; out.n_len = n_padded; out.n_len_org = win_eff; - out.data.assign(out.n_mel * out.n_len, 0.0f); - for (int m = 0; m < out.n_mel; m++) { - const int copy_len = std::min(win_eff, mel_full.n_len - off); + out.data.assign((size_t)out.n_mel * (size_t)out.n_len, 0.0f); + for (int64_t m = 0; m < out.n_mel; m++) { + const int64_t copy_len = std::min((int64_t)win_eff, mel_full.n_len - off); if (copy_len > 0) { std::copy(mel_full.data.begin() + (size_t)m * mel_full.n_len + off, mel_full.data.begin() + (size_t)m * mel_full.n_len + off + copy_len, @@ -823,37 +835,38 @@ bool mtmd_audio_preprocessor_granite_speech::preprocess(const float * } double mmax = -1e20; - for (int i = 0; i < mel.n_mel * mel.n_len; i++) { + const size_t mel_size = (size_t)mel.n_mel * (size_t)mel.n_len; + for (size_t i = 0; i < mel_size; i++) { if (mel.data[i] > mmax) { mmax = mel.data[i]; } } mmax -= 8.0; - for (int i = 0; i < mel.n_mel * mel.n_len; i++) { + for (size_t i = 0; i < mel_size; i++) { if (mel.data[i] < mmax) { mel.data[i] = mmax; } mel.data[i] = (mel.data[i] + 4.0) / 4.0; } - int n_frames = mel.n_len; + int64_t n_frames = mel.n_len; if (n_frames % 2 == 1) { n_frames--; } - const int n_mel = mel.n_mel; - const int n_stacked = n_frames / 2; + const int64_t n_mel = mel.n_mel; + const int64_t n_stacked = n_frames / 2; mtmd_audio_mel stacked; stacked.n_mel = 2 * n_mel; stacked.n_len = n_stacked; - stacked.n_len_org = (int)n_samples; - stacked.data.resize(2 * n_mel * n_stacked); + stacked.n_len_org = (int64_t)n_samples; + stacked.data.resize((size_t)2 * (size_t)n_mel * (size_t)n_stacked); - for (int t = 0; t < n_stacked; t++) { - for (int m = 0; m < n_mel; m++) { - stacked.data[m * n_stacked + t] = mel.data[m * mel.n_len + 2 * t]; - stacked.data[(m + n_mel) * n_stacked + t] = mel.data[m * mel.n_len + 2 * t + 1]; + for (int64_t t = 0; t < n_stacked; t++) { + for (int64_t m = 0; m < n_mel; m++) { + stacked.data[(size_t)m * n_stacked + t] = mel.data[(size_t)m * mel.n_len + 2 * t]; + stacked.data[(size_t)(m + n_mel) * n_stacked + t] = mel.data[(size_t)m * mel.n_len + 2 * t + 1]; } } @@ -921,8 +934,8 @@ bool mtmd_audio_preprocessor_gemma4a::preprocess(const float * s const int hop = hparams.audio_hop_len; const int n_with_left = (int)chunk_len + pad_left; // PyTorch: unfold(size=frame_length+1, step=hop) on semicausal-padded waveform - const int pt_frames = (n_with_left - (hparams.audio_window_len + 1)) / hop + 1; - const int n_padded_needed = (pt_frames - 1) * hop + fft_size; + const int64_t pt_frames = (n_with_left - (hparams.audio_window_len + 1)) / hop + 1; + const int64_t n_padded_needed = (pt_frames - 1) * hop + fft_size; const int total_pad = std::max((int)(n_padded_needed - (int)chunk_len), pad_left); std::vector padded_samples(total_pad + chunk_len, 0.0f); std::copy(chunk_ptr, chunk_ptr + chunk_len, padded_samples.data() + pad_left); diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index 9656e3940f53..ad96bd847cfc 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -10,16 +10,16 @@ #define MTMD_INTERNAL_HEADER struct mtmd_audio_mel { - int n_len; - int n_len_org; - int n_mel; + int64_t n_len; + int64_t n_len_org; + int64_t n_mel; std::vector data; }; struct mtmd_audio_mel_filters { - int32_t n_mel; - int32_t n_fft; + int64_t n_mel; + int64_t n_fft; std::vector data; }; @@ -39,8 +39,8 @@ struct mtmd_audio_cache { // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime. // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257. - void fill_mel_filterbank_matrix(int n_mel, - int n_fft, + void fill_mel_filterbank_matrix(int64_t n_mel, + int64_t n_fft, int sample_rate, // e.g. 16000 float fmin = 0.0f, // e.g. 0.0 float fmax = -1.0f, // e.g. sr/2; pass -1 for auto diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index a3cad7cd0683..8704ea79d7a0 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -32,9 +32,9 @@ static volatile bool g_is_generating = false; static volatile bool g_is_interrupted = false; /** - * Please note that this is NOT a production-ready stuff. + * Please note that this is NOT a production-ready binary. * It is a playground for trying multimodal support in llama.cpp. - * For contributors: please keep this code simple and easy to understand. + * For contributors: please keep this code simple and easy to understand. Do not add unnecessary complexity. The goal is to have a simple CLI for testing multimodal support. */ static void show_additional_info(int /*argc*/, char ** argv) { @@ -65,6 +65,14 @@ static void sigint_handler(int signo) { } #endif +// this is only used by tests.sh to capture the response ; it's not meant to be used in production +static void inject_test_response_marker() { + const char * env = std::getenv("MTMD_TEST_RESPONSE_MARKER"); + if (env) { + LOG("%s\n", env); + } +} + struct mtmd_cli_context { mtmd::context_ptr ctx_vision; common_init_result_ptr llama_init; @@ -79,6 +87,8 @@ struct mtmd_cli_context { mtmd::bitmaps bitmaps; std::vector videos; + mtmd::batch_ptr mbatch; + // chat template common_chat_templates_ptr tmpls; std::vector chat_history; @@ -233,6 +243,8 @@ static std::string chat_add_and_format(mtmd_cli_context & ctx, common_chat_msg & } static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) { + inject_test_response_marker(); + bool add_bos = ctx.chat_history.empty(); auto formatted_chat = chat_add_and_format(ctx, msg); LOG_DBG("formatted_chat.prompt: %s\n", formatted_chat.c_str()); @@ -259,20 +271,95 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg) { ctx.bitmaps.entries.clear(); ctx.videos.clear(); - llama_pos new_n_past; - if (mtmd_helper_eval_chunks(ctx.ctx_vision.get(), - ctx.lctx, // lctx - chunks.ptr.get(), // chunks - ctx.n_past, // n_past - 0, // seq_id - ctx.n_batch, // n_batch - true, // logits_last - &new_n_past)) { - LOG_ERR("Unable to eval prompt\n"); - return 1; - } + // batch encode all media chunks, then decode each + size_t n_chunks = mtmd_input_chunks_size(chunks.ptr.get()); + for (size_t i = 0; i < n_chunks; i++) { + auto chunk = mtmd_input_chunks_get(chunks.ptr.get(), i); + auto chunk_type = mtmd_input_chunk_get_type(chunk); + + if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + // decode text chunk + llama_pos new_n_past = ctx.n_past; + res = mtmd_helper_eval_chunk_single(ctx.ctx_vision.get(), + ctx.lctx, + chunk, + ctx.n_past, + 0, // seq_id + ctx.n_batch, + i == n_chunks - 1, // logits_last + &new_n_past); + if (res != 0) { + LOG_ERR("Unable to eval text chunk %zu\n", i); + return 1; + } + ctx.n_past = new_n_past; + } else { + // media chunk: try to get embd from existing batch, or create a new batch + float * embd = nullptr; + if (ctx.mbatch) { + embd = mtmd_batch_get_output_embd(ctx.mbatch.get(), chunk); + + if (embd) { + LOG_DBG("found embd for media chunk %zu in existing batch\n", i); + } else { + LOG_DBG("media chunk %zu not found in existing batch, creating new batch\n", i); + } + } - ctx.n_past = new_n_past; + if (!embd) { + // create and encode a new batch with as many media chunks as possible + ctx.mbatch.reset(mtmd_batch_init(ctx.ctx_vision.get())); + res = mtmd_batch_add_chunk(ctx.mbatch.get(), chunk); + GGML_ASSERT(res == 0); // first chunk must always succeed + + int n_added = 1; + // add as many subsequent media chunks as possible + for (size_t j = i + 1; j < n_chunks; j++) { + auto next_chunk = mtmd_input_chunks_get(chunks.ptr.get(), j); + auto next_type = mtmd_input_chunk_get_type(next_chunk); + if (next_type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + break; // text chunk splits the batch + } + res = mtmd_batch_add_chunk(ctx.mbatch.get(), next_chunk); + if (res != 0) { + break; // batch full or incompatible + } + n_added++; + } + + int64_t time_start = ggml_time_ms(); + LOG_INF("encoding mtmd batch, n_chunks = %d (done = %zu, total = %zu)\n", n_added, i, n_chunks); + res = mtmd_batch_encode(ctx.mbatch.get()); + if (res != 0) { + LOG_ERR("Failed to encode mtmd batch, res = %d\n", res); + return 1; + } + LOG_INF("mtmd batch encoding done in %d ms\n", (int)(ggml_time_ms() - time_start)); + + embd = mtmd_batch_get_output_embd(ctx.mbatch.get(), chunk); + } + + GGML_ASSERT(embd != nullptr); + + llama_pos new_n_past = ctx.n_past; + res = mtmd_helper_decode_image_chunk(ctx.ctx_vision.get(), + ctx.lctx, + chunk, + embd, + ctx.n_past, + 0, // seq_id + ctx.n_batch, + &new_n_past, + nullptr, // callback + nullptr // user_data + ); + if (res != 0) { + LOG_ERR("Unable to decode media chunk %zu\n", i); + return 1; + } + ctx.n_past = new_n_past; + } + } LOG("\n"); @@ -309,6 +396,9 @@ int main(int argc, char ** argv) { int n_predict = params.n_predict < 0 ? INT_MAX : params.n_predict; + console::init(params.simple_io, params.use_color); + atexit([]() { console::cleanup(); }); + // Ctrl+C handling { #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index 2d11a33804a3..3c73db4431e7 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -247,7 +247,9 @@ int32_t mtmd_helper_decode_image_chunk( llama_pos n_past, llama_seq_id seq_id, int32_t n_batch, - llama_pos * new_n_past) { + llama_pos * new_n_past, + mtmd_helper_post_decode_callback callback, + void * user_data) { GGML_ASSERT(n_batch > 0); auto chunk_type = mtmd_input_chunk_get_type(chunk); const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio"; @@ -302,10 +304,23 @@ int32_t mtmd_helper_decode_image_chunk( int32_t ret = llama_decode(lctx, batch_embd_view); if (ret != 0) { LOG_ERR("failed to decode %s\n", name); - llama_set_causal_attn(lctx, true); // restore causal attn + if (use_non_causal) { + llama_set_causal_attn(lctx, true); + } return ret; } + if (callback != nullptr) { + ret = callback(batch_embd_view, user_data); + if (ret != 0) { + LOG_ERR("post-decode callback failed\n"); + if (use_non_causal) { + llama_set_causal_attn(lctx, true); + } + return ret; + } + } + LOG_INF("%s decoded (batch %d/%d) in %" PRId64 " ms\n", name, i_batch+1, n_img_batches, ggml_time_ms() - t1); i_batch++; @@ -379,7 +394,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, LOG_INF("%s slice encoded in %" PRId64 " ms\n", name, ggml_time_ms() - t0); float * embd = mtmd_get_output_embd(ctx); - ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past); + ret = mtmd_helper_decode_image_chunk(ctx, lctx, chunk, embd, n_past, seq_id, n_batch, new_n_past, nullptr, nullptr); if (ret != 0) { LOG_ERR("failed to decode %s\n", name); llama_batch_free(text_batch); @@ -567,13 +582,29 @@ mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, } mtmd_helper_bitmap_wrapper mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname, bool placeholder) { - std::vector buf; +#ifdef _WIN32 + int wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, NULL, 0); + if (!wlen) { + LOG_ERR("Unable to convert filename to UTF-16: %s\n", fname); + return {nullptr, nullptr}; + } + std::vector wfname(wlen); + wlen = MultiByteToWideChar(CP_UTF8, 0, fname, -1, wfname.data(), wlen); + if (!wlen) { + LOG_ERR("Unable to convert filename to UTF-16: %s\n", fname); + return {nullptr, nullptr}; + } + FILE * f = _wfopen(wfname.data(), L"rb"); +#else FILE * f = fopen(fname, "rb"); +#endif if (!f) { LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno)); return {nullptr, nullptr}; } + std::vector buf; + fseek(f, 0, SEEK_END); long file_size = ftell(f); fseek(f, 0, SEEK_SET); diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h index 164b7c6689d9..680a2317df0e 100644 --- a/tools/mtmd/mtmd-helper.h +++ b/tools/mtmd/mtmd-helper.h @@ -67,8 +67,8 @@ MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, // helper function that automatically: // 1. run llama_decode() on text chunks -// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() -// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error +// 2. run mtmd_encode_chunk() on image chunks, then mtmd_get_output_embd() and then llama_decode() +// if any of the mtmd_encode_chunk() or llama_decode() calls return non-zero, stop and forward the error // otherwise, returns 0 on success // this function is NOT thread-safe MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, @@ -91,6 +91,8 @@ MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, bool logits_last, llama_pos * new_n_past); +typedef int32_t (*mtmd_helper_post_decode_callback)(struct llama_batch batch, void * user_data); + // helper function to decode an image whose embeddings have already been calculated // this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention) // ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure @@ -101,7 +103,9 @@ MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx, llama_pos n_past, llama_seq_id seq_id, int32_t n_batch, - llama_pos * new_n_past); + llama_pos * new_n_past, + mtmd_helper_post_decode_callback callback, + void * user_data); // // video input helpers (requires ffmpeg/ffprobe installed on the system) @@ -157,13 +161,16 @@ MTMD_API int32_t mtmd_helper_video_read_next(mtmd_helper_video * ctx, } // extern "C" #endif +#ifdef __cplusplus +#include +#include + +namespace mtmd_helper { + // // C++ wrappers // -#ifdef __cplusplus -namespace mtmd_helper { - // video-related C++ wrappers struct mtmd_helper_video_deleter { void operator()(mtmd_helper_video * val) { mtmd_helper_video_free(val); } diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp index bedf44e07cf3..01d9b4517a8b 100644 --- a/tools/mtmd/mtmd-image.cpp +++ b/tools/mtmd/mtmd-image.cpp @@ -4,17 +4,33 @@ #include #include -// -// base implementation -// +void mtmd_image_preproc_out::append(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized) { + clip_image_f32 dst; + dst.from_u8(img); + if (normalized) { + dst.normalize(hparams.image_mean, hparams.image_std); + } + entries.push_back(std::move(dst)); +} + +void mtmd_image_preproc_out::append(const clip_hparams & hparams, const std::vector & imgs, bool normalized) { + for (const auto & img : imgs) { + append(hparams, img, normalized); + } +} -void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { - dst.from_u8(src); - dst.normalize(mean, std); +void mtmd_image_preproc_out::append(const clip_hparams & hparams, clip_image_f32 & img, bool normalized) { + if (normalized) { + img.normalize(hparams.image_mean, hparams.image_std); + } + entries.push_back(std::move(img)); } -void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) { - dst.from_u8(src); +void mtmd_image_preproc_out::append_overview(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized) { + overview.from_u8(img); + if (normalized) { + overview.normalize(hparams.image_mean, hparams.image_std); + } } // set of tools to manipulate images @@ -595,27 +611,24 @@ struct img_tool { // mtmd_image_preprocessor_llava_uhd // -bool mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { +mtmd_image_preproc_out mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img) { const clip_image_size original_size = img.get_size(); auto const inst = get_slice_instructions(original_size); - std::vector imgs = slice_image(img, inst); - - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std); - output.entries.push_back(std::move(res)); - } + auto sliced = slice_image(img, inst); + mtmd_image_preproc_out output; + output.append_overview(hparams, sliced.overview, true); + output.append(hparams, sliced.slices, true); output.grid_x = inst.grid_size.width; output.grid_y = inst.grid_size.height; - return true; + + return output; } mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_llava_uhd::get_slice_instructions(const clip_image_size & original_size) { mtmd_image_preprocessor_llava_uhd::slice_instructions res; // align slices by patch_size * n_merge so an integer number of merger output tokens fits per slice - const int n_merge = hparams.n_merge > 0 ? hparams.n_merge : 1; + const int n_merge = hparams.n_merge; const int patch_size = hparams.patch_size * n_merge; const int slice_size = hparams.image_size; const int original_width = original_size.width; @@ -717,28 +730,21 @@ mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_ll return res; } -std::vector mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst, bool overview_first) { - std::vector output; +mtmd_image_preprocessor_llava_uhd::slice_output mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst) { + slice_output output; // resize to overview size - clip_image_u8_ptr resized_img(clip_image_u8_init()); - img_tool::resize(img, *resized_img, inst.overview_size, hparams.image_resize_algo_ov, + img_tool::resize(img, output.overview, inst.overview_size, hparams.image_resize_algo_ov, hparams.image_pad_ov, hparams.image_pad_color_ov); - if (overview_first) { - output.push_back(std::move(resized_img)); - } if (inst.slices.empty()) { - // no slices, just return the resized image - if (!overview_first) { - output.push_back(std::move(resized_img)); - } + // no slices, just return the overview image return output; } // resize to refined size - clip_image_u8_ptr refined_img(clip_image_u8_init()); - img_tool::resize(img, *refined_img, inst.refined_size, hparams.image_resize_algo_rf, + clip_image_u8 refined_img; + img_tool::resize(img, refined_img, inst.refined_size, hparams.image_resize_algo_rf, hparams.image_pad_rf, hparams.image_pad_color_rf); // create slices @@ -748,13 +754,9 @@ std::vector mtmd_image_preprocessor_llava_uhd::slice_image(co int w = slice.size.width; int h = slice.size.height; - clip_image_u8_ptr img_slice(clip_image_u8_init()); - img_tool::crop(*refined_img, *img_slice, x, y, w, h); - output.push_back(std::move(img_slice)); - } - - if (!overview_first) { - output.push_back(std::move(resized_img)); + clip_image_u8 img_slice; + img_tool::crop(refined_img, img_slice, x, y, w, h); + output.slices.push_back(std::move(img_slice)); } return output; @@ -871,29 +873,28 @@ clip_image_size mtmd_image_preprocessor_llava_uhd::get_best_grid(const int max_s // mtmd_image_preprocessor_fixed_size // -bool mtmd_image_preprocessor_fixed_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { +mtmd_image_preproc_out mtmd_image_preprocessor_fixed_size::preprocess(const clip_image_u8 & img) { clip_image_u8 resized_image; int sz = hparams.image_size; img_tool::resize(img, resized_image, {sz, sz}, hparams.image_resize_algo, hparams.image_resize_pad, hparams.image_pad_color); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - img_u8_to_f32(resized_image, *img_f32, hparams.image_mean, hparams.image_std); - output.entries.push_back(std::move(img_f32)); - return true; + mtmd_image_preproc_out output; + output.append(hparams, resized_image, true); + return output; } // // mtmd_image_preprocessor_dyn_size // -bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { +mtmd_image_preproc_out mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img) { GGML_ASSERT(hparams.image_min_pixels > 0 && hparams.image_max_pixels > 0); clip_image_u8 resized_image; const clip_image_size original_size = img.get_size(); // the original pixtral model doesn't have n_merge - const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge; + const int cur_merge = hparams.n_merge; const clip_image_size target_size = img_tool::calc_size_preserved_ratio( original_size, hparams.patch_size * cur_merge, @@ -903,17 +904,16 @@ bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, cli hparams.image_resize_algo, hparams.image_resize_pad, hparams.image_pad_color); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - img_u8_to_f32(resized_image, *img_f32, hparams.image_mean, hparams.image_std); - output.entries.push_back(std::move(img_f32)); - return true; + mtmd_image_preproc_out output; + output.append(hparams, resized_image, true); + return output; } // // mtmd_image_preprocessor_longest_edge // -bool mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { +mtmd_image_preproc_out mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img) { GGML_ASSERT(hparams.image_longest_edge > 0); clip_image_u8 resized_image; const clip_image_size original_size = img.get_size(); @@ -927,10 +927,9 @@ bool mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img, hparams.image_resize_algo, hparams.image_resize_pad, hparams.image_pad_color); - clip_image_f32_ptr img_f32(clip_image_f32_init()); - img_u8_to_f32(resized_image, *img_f32, hparams.image_mean, hparams.image_std); - output.entries.push_back(std::move(img_f32)); - return true; + mtmd_image_preproc_out output; + output.append(hparams, resized_image, true); + return output; } // @@ -1040,7 +1039,7 @@ clip_image_size mtmd_image_preprocessor_lfm2::get_grid_layout(int height, int wi // mtmd_image_preprocessor_idefics3 // -bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { +mtmd_image_preproc_out mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img) { // The refined size has two steps: // 1. Resize w/ aspect-ratio preserving such that the longer side is // the preprocessor longest size @@ -1075,44 +1074,40 @@ bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, cli }); } } - auto imgs = slice_image(img, instructions); - - // cast and normalize to f32 - for (size_t i = 0; i < imgs.size(); ++i) { - // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp"); - clip_image_f32_ptr res(clip_image_f32_init()); - img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std); - output.entries.push_back(std::move(res)); - } + auto sliced = slice_image(img, instructions); + mtmd_image_preproc_out output; + output.append_overview(hparams, sliced.overview, true); + output.append(hparams, sliced.slices, true); output.grid_x = instructions.grid_size.width; output.grid_y = instructions.grid_size.height; - return true; + return output; } // // mtmd_image_preprocessor_internvl // -bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { +mtmd_image_preproc_out mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img) { GGML_ASSERT(!hparams.image_res_candidates.empty()); const clip_image_size original_size = img.get_size(); auto const inst = get_slice_instructions(original_size); - std::vector imgs = slice_image(img, inst, false); + auto sliced = slice_image(img, inst); - for (size_t i = 0; i < imgs.size(); ++i) { - clip_image_f32_ptr res(clip_image_f32_init()); - img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std); - output.entries.push_back(std::move(res)); - } - return true; + mtmd_image_preproc_out output; + // InternVL: slices first, then overview + output.append(hparams, sliced.slices, true); + output.append_overview(hparams, sliced.overview, true); + output.grid_x = inst.grid_size.width; + output.grid_y = inst.grid_size.height; + return output; } // // mtmd_image_preprocessor_deepseekocr // -bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { +mtmd_image_preproc_out mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img) { static constexpr int native_resolutions[] = { 1024 /* base */, 1280 /* large */ }; // TODO: support 512 (tiny) and 640 (small) once we have eval data for them @@ -1135,14 +1130,12 @@ bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_u8 padded; img_tool::resize(img, padded, {image_size, image_size}, RESIZE_ALGO_BICUBIC_PILLOW, PAD_NEAREST, hparams.image_pad_color); - - clip_image_f32_ptr res(clip_image_f32_init()); - img_u8_to_f32(padded, *res, hparams.image_mean, hparams.image_std); - output.entries.push_back(std::move(res)); - - output.grid_x = 1; - output.grid_y = 1; - return true; + mtmd_image_preproc_out output; + output.append_overview(hparams, padded, true); + output.grid_x = 0; + output.grid_y = 0; + // TODO @ngxson : support slicing for DeepSeek-OCR, to do in another PR + return output; } // @@ -1205,10 +1198,11 @@ clip_image_size mtmd_image_preprocessor_deepseekocr2::find_closest_aspect_ratio( return best_ratio; } -bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { +mtmd_image_preproc_out mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img) { // emit 768x768 local tiles when the image is larger than a tile in either // dimension, then always a 1024x1024 global view. order: [tiles..., global]. + mtmd_image_preproc_out output; const auto img_size = img.get_size(); if (img_size.width > tile_size || img_size.height > tile_size) { const float aspect_ratio = static_cast(img_size.width) / img_size.height; @@ -1224,9 +1218,7 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, for (int col = 0; col < grid.width; col++) { clip_image_u8 tile; img_tool::crop(refined, tile, col * tile_size, row * tile_size, tile_size, tile_size); - clip_image_f32_ptr res(clip_image_f32_init()); - img_u8_to_f32(tile, *res, hparams.image_mean, hparams.image_std); - output.entries.push_back(std::move(res)); + output.append(hparams, tile, true); } } } @@ -1235,14 +1227,9 @@ bool mtmd_image_preprocessor_deepseekocr2::preprocess(const clip_image_u8 & img, clip_image_u8 padded; img_tool::resize(img, padded, { base_size, base_size }, RESIZE_ALGO_BICUBIC_PILLOW, PAD_NEAREST, hparams.image_pad_color); - clip_image_f32_ptr global(clip_image_f32_init()); - img_u8_to_f32(padded, *global, hparams.image_mean, hparams.image_std); - global->add_viewsep = true; - output.entries.push_back(std::move(global)); - - output.grid_x = 1; - output.grid_y = 1; - return true; + output.append_overview(hparams, padded, true); + output.overview.add_viewsep = true; + return output; } // @@ -1258,7 +1245,8 @@ void mtmd_image_preprocessor_step3vl::img_u8_resize_bilinear_to_f32( const float std[3]) { const auto src_size = src.get_size(); if (src_size.width == target_width && src_size.height == target_height) { - img_u8_to_f32(src, dst, mean, std); + dst.from_u8(src); + dst.normalize(mean, std); return; } @@ -1453,24 +1441,24 @@ mtmd_image_preprocessor_step3vl::slice_instructions mtmd_image_preprocessor_step return instructions; } -bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { +mtmd_image_preproc_out mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img) { clip_image_u8 prepared = prepare_image(img, hparams); const auto instructions = build_slice_instructions(hparams, prepared.get_size()); - clip_image_f32_ptr overview_f32(clip_image_f32_init()); + mtmd_image_preproc_out output; + // overview (normalized f32, already includes mean/std) img_u8_resize_bilinear_to_f32( prepared, - *overview_f32, + output.overview, hparams.image_size, hparams.image_size, hparams.image_mean, hparams.image_std); - output.entries.push_back(std::move(overview_f32)); if (instructions.slices.empty()) { output.grid_x = 0; output.grid_y = 0; - return true; + return output; } clip_image_u8 img_for_crop = prepared; @@ -1486,28 +1474,28 @@ bool mtmd_image_preprocessor_step3vl::preprocess(const clip_image_u8 & img, clip // If the requested patch extends past the source image, pad the out-of-bounds area with black. clip_image_u8 patch = crop_with_black_padding(img_for_crop, slice.x, slice.y, slice.size.width, slice.size.height); - clip_image_f32_ptr patch_f32(clip_image_f32_init()); + clip_image_f32 patch_f32; img_u8_resize_bilinear_to_f32( patch, - *patch_f32, + patch_f32, crop_size, crop_size, hparams.image_mean, hparams.image_std); - output.entries.push_back(std::move(patch_f32)); + output.append(hparams, patch_f32, false); } output.grid_x = instructions.grid_size.width; output.grid_y = instructions.grid_size.height; - return true; + return output; } // // mtmd_image_preprocessor_youtuvl // -bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) { +mtmd_image_preproc_out mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img) { const int patch_size = hparams.patch_size; // typically 16 const int merge_size = hparams.n_merge; // typically 2 const int align_size = patch_size * merge_size; // 32 @@ -1551,10 +1539,22 @@ bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip clip_image_u8 resized; img_tool::resize(img, resized, new_size, hparams.image_resize_algo, hparams.image_resize_pad); - // Normalize to float32 - clip_image_f32_ptr img_f32(clip_image_f32_init()); - img_u8_to_f32(resized, *img_f32, hparams.image_mean, hparams.image_std); - // Add to results - output.entries.push_back(std::move(img_f32)); - return true; + mtmd_image_preproc_out output; + output.append(hparams, resized, true); + return output; +} + +mtmd_image_preproc_out mtmd_image_preprocessor_granite::preprocess(const clip_image_u8 & img) { + auto output = mtmd_image_preprocessor_llava_uhd::preprocess(img); + if (output.entries.size() == 0) { + // Single-tile (overview only): append one newline row. + output.overview.add_newline = true; + } else { + // Multi-tile: overview gets no newline, grid tiles get one. + output.overview.add_newline = false; + for (size_t i = 0; i < output.entries.size(); ++i) { + output.entries[i].add_newline = true; + } + } + return output; } diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h index 91a5bc253ef8..f458e39e7641 100644 --- a/tools/mtmd/mtmd-image.h +++ b/tools/mtmd/mtmd-image.h @@ -8,6 +8,24 @@ #define MTMD_INTERNAL_HEADER +struct mtmd_image_preproc_out { + std::vector entries; + // grid size is required for llava-uhd style models + + clip_image_f32 overview; // overview image (downscaled image) + int grid_x = 0; + int grid_y = 0; + + void append(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized = true); + void append(const clip_hparams & hparams, const std::vector & imgs, bool normalized = true); + void append(const clip_hparams & hparams, clip_image_f32 & img, bool normalized = true); + + void append_overview(const clip_hparams & hparams, const clip_image_u8 & img, bool normalized = true); + bool has_overview() const { + return overview.nx() > 0 || overview.ny() > 0; + } +}; + // base class, models must inherit from this class struct mtmd_image_preprocessor { const clip_hparams & hparams; @@ -15,10 +33,7 @@ struct mtmd_image_preprocessor { mtmd_image_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {} virtual ~mtmd_image_preprocessor() = default; - virtual bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) = 0; - - void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]); - void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst); + virtual mtmd_image_preproc_out preprocess(const clip_image_u8 & img) = 0; }; /** @@ -39,10 +54,12 @@ struct mtmd_image_preprocessor { * [overview] --> [slice 1] --> [slice 2] * | | * +--> [slice 3] --> [slice 4] + * + * NOTE: for the ordering of overview, set "ov_img_first" on the mtmd_context */ struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor { mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} - bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; + mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override; struct slice_coordinates { int x; @@ -60,7 +77,11 @@ struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor { // LFM2 override this function to implement its custom slicing logic virtual slice_instructions get_slice_instructions(const clip_image_size & original_size); - std::vector slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true); + struct slice_output { + clip_image_u8 overview; + std::vector slices; + }; + slice_output slice_image(const clip_image_u8 & img, const slice_instructions & inst); private: clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false); @@ -91,7 +112,7 @@ struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor { // downscale or upscale the input image to fixed size struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor { mtmd_image_preprocessor_fixed_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} - bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; + mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override; }; // resize image to multiple of patch_size*n_merge, while preserving aspect ratio @@ -99,13 +120,13 @@ struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor { // this is used by models with native support for dynamic image size, for example: Qwen-VL, Pixtral, Kimi-VL, etc struct mtmd_image_preprocessor_dyn_size : mtmd_image_preprocessor { mtmd_image_preprocessor_dyn_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} - bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; + mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override; }; // similar to mtmd_image_preprocessor_dyn_size, but resize the image to have longest edge equal to hparams.image_longest_edge, while preserving aspect ratio struct mtmd_image_preprocessor_longest_edge : mtmd_image_preprocessor { mtmd_image_preprocessor_longest_edge(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} - bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; + mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override; }; // custom llava-uhd slicing logic for LFM2 @@ -131,17 +152,17 @@ struct mtmd_image_preprocessor_lfm2 : mtmd_image_preprocessor_llava_uhd { struct mtmd_image_preprocessor_idefics3 : mtmd_image_preprocessor_llava_uhd { mtmd_image_preprocessor_idefics3(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {} - bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; + mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override; }; struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd { mtmd_image_preprocessor_internvl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {} - bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; + mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override; }; struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor { mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} - bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; + mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override; }; // DeepSeek-OCR-2: a 1024x1024 global view, plus InternVL-style 768x768 local @@ -153,7 +174,7 @@ struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor { static constexpr int max_tiles = 6; mtmd_image_preprocessor_deepseekocr2(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} - bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; + mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override; private: static std::vector get_target_ratios(); @@ -168,7 +189,7 @@ struct mtmd_image_preprocessor_deepseekocr2 : mtmd_image_preprocessor { // ref: https://huggingface.co/stepfun-ai/Step3-VL-10B/blob/main/processing_step3.py struct mtmd_image_preprocessor_step3vl : mtmd_image_preprocessor_llava_uhd { mtmd_image_preprocessor_step3vl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {} - bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; + mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override; static slice_instructions build_slice_instructions(const clip_hparams & params, const clip_image_size & prepared_size); private: @@ -195,5 +216,11 @@ struct mtmd_image_preprocessor_step3vl : mtmd_image_preprocessor_llava_uhd { struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor { mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {} - bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override; + mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override; +}; + +// similar to llava_uhd, but has add_newline +struct mtmd_image_preprocessor_granite : mtmd_image_preprocessor_llava_uhd { + mtmd_image_preprocessor_granite(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {} + mtmd_image_preproc_out preprocess(const clip_image_u8 & img) override; }; diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 4140a3c4aa03..724538b5857a 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -69,8 +69,8 @@ struct mtmd_bitmap { return data.size(); } - bool can_batch_with(const mtmd_bitmap & other) const { - // [QWEN_VIDEO] can batch if both are images with same size + bool can_merge_with(const mtmd_bitmap & other) const { + // [QWEN_VIDEO] can (temporal) merge if both are images with same size return !is_audio && !other.is_audio && nx == other.nx && ny == other.ny; } @@ -90,12 +90,23 @@ struct mtmd_image_tokens { uint32_t ny = 0; // number of tokens in y direction mtmd_pos_type pos = MTMD_POS_TYPE_NORMAL; uint32_t image_idx = 0; // 0-based position of this image among image chunks in the prompt(used by pos == MTMD_POS_TYPE_HUNYUANVL) + uint32_t n_temporal_merge = 1; // for qwen-vl style temporal merge uint32_t n_tokens() const { if (pos == MTMD_POS_TYPE_HUNYUANVL) { // [BOI] [row0 tokens + newline] ... [row(ny-1) tokens + newline] [EOI] return (nx + 1) * ny + 2; } - return nx * ny; + uint32_t nz = batch_f32.entries.size(); + if (n_temporal_merge > 1) { + // [QWEN_VIDEO] this logic is quite ugly, it's mostly to make qwen-vl temporal merge work, can be improved in the future + // TODO: simplify this by repeating the last frame until it fits the temporal merge + if (nz % n_temporal_merge != 0) { + nz = nz / n_temporal_merge + 1; + } else { + nz = nz / n_temporal_merge; + } + } + return nx * ny * nz; } clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking @@ -103,19 +114,24 @@ struct mtmd_image_tokens { // true if one of entries in batch_f32 is a placeholder bool is_placeholder() const { for (const auto & entry : batch_f32.entries) { - if (entry->is_placeholder()) { + if (entry.is_placeholder()) { return true; } } return false; } + bool can_batch_with(const mtmd_image_tokens & other) { + return nx == other.nx && ny == other.ny && pos == other.pos; + } + mtmd_image_tokens clone() { return mtmd_image_tokens{ nx, ny, pos, image_idx, + n_temporal_merge, batch_f32.clone(), id }; @@ -131,7 +147,7 @@ struct mtmd_audio_tokens { // true if one of entries in batch_f32 is a placeholder bool is_placeholder() const { for (const auto & entry : batch_f32.entries) { - if (entry->is_placeholder()) { + if (entry.is_placeholder()) { return true; } } @@ -153,12 +169,49 @@ struct mtmd_input_chunk { std::vector tokens_text; mtmd_image_tokens_ptr tokens_image; mtmd_audio_tokens_ptr tokens_audio; + + bool can_batch_with(const mtmd_input_chunk & other) const { + if (type != other.type) { + return false; + } + + if (tokens_image && other.tokens_image) { + return tokens_image->can_batch_with(*other.tokens_image); + } + + // TODO: allow batching audio chunks of the same size + + return false; + } + + bool is_placeholder() const { + if (type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + return tokens_image && tokens_image->is_placeholder(); + } else if (type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { + return tokens_audio && tokens_audio->is_placeholder(); + } + return false; + } }; struct mtmd_input_chunks { std::vector entries; }; +struct mtmd_batch { + mtmd_context * ctx; + std::vector entries; + std::vector output_embd; // aggregated output embedding for the whole batch + mtmd_batch(mtmd_context * ctx): ctx(ctx) {} + int32_t n_tokens() const { + int32_t n = 0; + for (const auto * chunk : entries) { + n += mtmd_input_chunk_get_n_tokens(chunk); + } + return n; + } +}; + // slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings // models not having it (llava-1.6) will process embeddings without any special tokens in-between enum mtmd_slice_tmpl { @@ -197,6 +250,9 @@ mtmd_context_params mtmd_context_params_default() { /* image_max_tokens */ -1, /* cb_eval */ nullptr, /* cb_eval_user_data */ nullptr, + /* batch_max_tokens */ 1024, + /* progress_callback */ nullptr, + /* progress_callback_user_data */ nullptr, }; return params; } @@ -204,7 +260,7 @@ mtmd_context_params mtmd_context_params_default() { struct mtmd_context { struct clip_ctx * ctx_v; // vision struct clip_ctx * ctx_a; // audio - std::vector image_embd_v; // image embedding vector + std::vector out_embd; // image embedding vector bool print_timings; int n_threads; @@ -239,17 +295,21 @@ struct mtmd_context { std::unique_ptr audio_preproc; std::unique_ptr image_preproc; + // batching + int32_t batch_max_tokens; + // TODO @ngxson : add timings mtmd_context(const char * mmproj_fname, const llama_model * text_model, const mtmd_context_params & ctx_params, bool no_alloc = false) : - print_timings(ctx_params.print_timings), - n_threads (ctx_params.n_threads), - media_marker (ctx_params.media_marker), - n_embd_text (text_model ? llama_model_n_embd_inp(text_model) : -1), - vocab (text_model ? llama_model_get_vocab(text_model) : nullptr) + print_timings (ctx_params.print_timings), + n_threads (ctx_params.n_threads), + media_marker (ctx_params.media_marker), + n_embd_text (text_model ? llama_model_n_embd_inp(text_model) : -1), + vocab (text_model ? llama_model_get_vocab(text_model) : nullptr), + batch_max_tokens(ctx_params.batch_max_tokens) { if (ctx_params.image_marker != nullptr) { throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead"); @@ -287,6 +347,8 @@ struct mtmd_context { /* cb_eval */ ctx_params.cb_eval, /* cb_eval_user_data */ ctx_params.cb_eval_user_data, /* no_alloc */ no_alloc, + /* progress_callback */ ctx_params.progress_callback, + /* progress_callback_user_data */ ctx_params.progress_callback_user_data, }; auto res = clip_init(mmproj_fname, ctx_clip_params); @@ -458,6 +520,7 @@ struct mtmd_context { LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n" " https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__); image_preproc = std::make_unique(ctx_v); + ov_img_first = false; } break; case PROJECTOR_TYPE_STEP3VL: { @@ -481,6 +544,7 @@ struct mtmd_context { img_beg = ""; img_end = ""; image_preproc = std::make_unique(ctx_v); + ov_img_first = false; } break; case PROJECTOR_TYPE_KIMIVL: { @@ -557,11 +621,13 @@ struct mtmd_context { { img_end = "\n"; // prevent empty batch on llama-server image_preproc = std::make_unique(ctx_v); + ov_img_first = false; } break; case PROJECTOR_TYPE_DEEPSEEKOCR2: { img_end = "\n"; // prevent empty batch on llama-server image_preproc = std::make_unique(ctx_v); + ov_img_first = false; } break; case PROJECTOR_TYPE_HUNYUANVL: { @@ -581,7 +647,8 @@ struct mtmd_context { { img_beg = ""; img_end = ""; - image_preproc = std::make_unique(ctx_v); + image_preproc = std::make_unique(ctx_v); + ov_img_first = true; } break; default: throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj)); @@ -680,6 +747,16 @@ struct mtmd_context { return ctx_a ? clip_get_projector_type(ctx_a) : PROJECTOR_TYPE_UNKNOWN; } + int64_t n_embd_out() const { + if (ctx_v) { + return clip_n_mmproj_embd(ctx_v); + } else if (ctx_a) { + return clip_n_mmproj_embd(ctx_a); + } else { + throw std::runtime_error("no CLIP model loaded"); + } + } + ~mtmd_context() { clip_free(ctx_a); clip_free(ctx_v); @@ -845,7 +922,7 @@ struct mtmd_tokenizer { // [QWEN_VIDEO] handle frame merging for models that support it (i.e. qwen-vl) int n_merge_frames = 1; if (ctx->ctx_v) { - n_merge_frames = clip_model_n_batch_max(ctx->ctx_v); + n_merge_frames = clip_model_n_temporal_merge(ctx->ctx_v); GGML_ASSERT(n_merge_frames <= 2 && "we only support merging maximum 2 images for now; open an issue if this model supports merging more"); } @@ -860,7 +937,7 @@ struct mtmd_tokenizer { if (i + 1 < parts.size() && parts[i + 1].bitmap != nullptr) { const mtmd_bitmap * bm_a = parts[i].bitmap; const mtmd_bitmap * bm_b = parts[i + 1].bitmap; - if (bm_a->can_batch_with(*bm_b)) { + if (bm_a->can_merge_with(*bm_b)) { LOG_DBG("%s: merging 2 frames at part index %zu and %zu\n", __func__, i, i + 1); merged_bitmaps.push_back({bm_a, bm_b}); parts.erase(parts.begin() + i + 1); // collapse the second bitmap part @@ -965,7 +1042,10 @@ struct mtmd_tokenizer { int32_t add_media(std::vector & bitmaps) { GGML_ASSERT(!bitmaps.empty()); - if (!bitmaps[0]->is_audio) { + // note: only one type of media is supported per call, caller should enforce this + const bool is_vision = !bitmaps[0]->is_audio; + + if (is_vision) { // handle image if (!ctx->ctx_v) { @@ -979,7 +1059,7 @@ struct mtmd_tokenizer { // TODO @ngxson : this is quite hacky because preprocessor only support batch with one single element, that need to be fixed in the future (e.g. by changing the preprocessor interface always take single input) - clip_image_f32_batch batch_f32; + mtmd_image_preproc_out preproc_out; for (const auto * bmp : bitmaps) { // sanity check @@ -992,66 +1072,54 @@ struct mtmd_tokenizer { } // convert mtmd_bitmap to clip_image_u8 - clip_image_u8_ptr img_u8(clip_image_u8_init()); - img_u8->set_size( + clip_image_u8 img_u8; + img_u8.set_size( {(int)bmp->nx, (int)bmp->ny}, bmp->is_placeholder()); - img_u8->cpy_buf(bmp->get_ro_buf()); + img_u8.cpy_buf(bmp->get_ro_buf()); // preprocess image - clip_image_f32_batch tmp_batch; - bool ok = ctx->image_preproc->preprocess(*img_u8, tmp_batch); - if (!ok) { - LOG_ERR("Unable to preprocess image\n"); - return 2; - } + mtmd_image_preproc_out tmp_preproc_out = ctx->image_preproc->preprocess(img_u8); - // move entries and grid dimensions to the "global" batch_f32 - for (auto & entry : tmp_batch.entries) { - batch_f32.entries.emplace_back(std::move(entry)); + // move entries and grid dimensions to the "global" preproc_out + for (auto & entry : tmp_preproc_out.entries) { + preproc_out.entries.emplace_back(std::move(entry)); } // for llava-uhd style, we need to handle grid too - // we don't care about overwriting these values for now because llama-uhd doesn't support batching anyway - batch_f32.grid_x = tmp_batch.grid_x; - batch_f32.grid_y = tmp_batch.grid_y; - } - - // Annotate llava-next style tiles so clip_n_output_tokens accounts - // for per-tile newline injection. - if (ctx->proj_type_v() == PROJECTOR_TYPE_GRANITE4_VISION) { - if (batch_f32.entries.size() == 1) { - // Single-tile (overview only): append one newline row. - batch_f32.entries[0]->add_newline = true; - } else { - // Multi-tile: overview gets no newline, grid tiles get one. - batch_f32.entries[0]->add_newline = false; - for (size_t i = 1; i < batch_f32.entries.size(); ++i) { - batch_f32.entries[i]->add_newline = true; - } + // we don't care about overwriting these values for now because the case where bitmaps.size() > 1 is only for frame merging (qwen-vl), not supported by llava-uhd + if ((tmp_preproc_out.grid_x > 0 && tmp_preproc_out.grid_y > 0) + || tmp_preproc_out.has_overview()) { + GGML_ASSERT(bitmaps.size() == 1); + preproc_out.grid_x = tmp_preproc_out.grid_x; + preproc_out.grid_y = tmp_preproc_out.grid_y; + preproc_out.overview = std::move(tmp_preproc_out.overview); } } + LOG_DBG("%s: preproc_out has %zu entries, grid_x = %d, grid_y = %d, has_overview = %d\n", + __func__, preproc_out.entries.size(), preproc_out.grid_x, preproc_out.grid_y, + preproc_out.has_overview() ? 1 : 0); + // handle llava-uhd style preprocessing - const bool has_tiling_grid = batch_f32.grid_x > 0 && batch_f32.grid_y > 0; - if ( - ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 - || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6 - || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4 - || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3 - || ctx->slice_tmpl == MTMD_SLICE_TMPL_STEP3VL - || (ctx->slice_tmpl == MTMD_SLICE_TMPL_LFM2 && has_tiling_grid) - ) { + // (output either a grid, or overview-only) + const bool has_tiling_grid = (preproc_out.grid_x > 0 && preproc_out.grid_y > 0) + || preproc_out.has_overview(); + + if (has_tiling_grid) { // [QWEN_VIDEO] we do not support "frame merging" for llama-uhd style, so no batching for now GGML_ASSERT(bitmaps.size() == 1); - const int n_col = batch_f32.grid_x; - const int n_row = batch_f32.grid_y; + const int n_col = preproc_out.grid_x; + const int n_row = preproc_out.grid_y; + // split batch into chunks of single images - // NOTE: batch_f32 will be invalidated after this call - auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[0]->id); + auto chunks = split_batch_to_chunk(std::move(preproc_out), bitmaps[0]->id); GGML_ASSERT(chunks.size() > 0); + // NOTE: preproc_out is invalidated after this point, do not use it anymore + + // split_batch_to_chunk must always put the overview image first auto ov_chunk = std::move(chunks.front()); chunks.erase(chunks.begin()); @@ -1078,7 +1146,16 @@ struct mtmd_tokenizer { std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1); add_text(std::string(buf.get(), buf.get() + sz - 1), true); } - cur.entries.emplace_back(std::move(chunks[y * n_col + x])); + + auto & curr_chunk = chunks[y * n_col + x]; + auto & curr_batch = curr_chunk.tokens_image->batch_f32; + if (curr_batch.entries.size() != 1) { + throw std::runtime_error(string_format("%s: expect 1 image in batch_f32", __func__)); + } + + LOG_DBG("%s: adding slice image at row %d col %d\n", __func__, y, x); + cur.entries.emplace_back(std::move(curr_chunk)); + add_text(ctx->tok_sli_img_end); if (!is_last_in_row) { add_text(ctx->tok_sli_img_mid); @@ -1100,20 +1177,29 @@ struct mtmd_tokenizer { } else { + if (preproc_out.entries.size() == 0) { + LOG_ERR("%s: no image tokens produced by preprocessor (ref: https://github.com/ggml-org/llama.cpp/pull/24769)\n", __func__); + return 2; + } + size_t n_tokens = 0; - for (const auto & e : batch_f32.entries) { - n_tokens += clip_n_output_tokens(ctx->ctx_v, e.get()); - if (clip_model_n_batch_max(ctx->ctx_v) == 2) { + for (auto & e : preproc_out.entries) { + n_tokens += clip_n_output_tokens(ctx->ctx_v, &e); + if (clip_model_n_temporal_merge(ctx->ctx_v) == 2) { // [QWEN_VIDEO] pair input is merged to the same embd, so only count as one image break; } } mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); + + // [QWEN_VIDEO] improve this in the future + image_tokens->n_temporal_merge = clip_model_n_temporal_merge(ctx->ctx_v); + if (mtmd_decode_use_mrope(ctx)) { // for Qwen2VL, we need this information for M-RoPE decoding positions - image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get()); - image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get()); + image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, &preproc_out.entries[0]); + image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, &preproc_out.entries[0]); } else { // other models, we only need the total number of tokens image_tokens->nx = n_tokens; @@ -1128,6 +1214,12 @@ struct mtmd_tokenizer { image_tokens->image_idx = n_images_added; GGML_ASSERT(n_tokens == (size_t)image_tokens->n_tokens()); } + + clip_image_f32_batch batch_f32; + batch_f32.is_audio = false; + batch_f32.entries = std::move(preproc_out.entries); + // do NOT use preproc_out from this point on, it's moved + image_tokens->batch_f32 = std::move(batch_f32); image_tokens->id = bitmaps[0]->id; // optional @@ -1207,13 +1299,16 @@ struct mtmd_tokenizer { for (auto & mel_spec : mel_spec_chunks) { const bool is_placeholder = mel_spec.data.empty(); - clip_image_f32_ptr mel_f32(clip_image_f32_init()); - mel_f32->set_size( - {mel_spec.n_len, mel_spec.n_mel}, + // Validate dimensions fit in clip_image_size (int) + GGML_ASSERT(mel_spec.n_len <= INT32_MAX && mel_spec.n_len >= 0); + GGML_ASSERT(mel_spec.n_mel <= INT32_MAX && mel_spec.n_mel >= 0); + clip_image_f32 mel_f32; + mel_f32.set_size( + {(int)mel_spec.n_len, (int)mel_spec.n_mel}, is_placeholder, /* is_audio */ true); - mel_f32->cpy_buf(mel_spec.data); + mel_f32.cpy_buf(mel_spec.data); - size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get()); + size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, &mel_f32); clip_image_f32_batch batch_f32; batch_f32.is_audio = true; @@ -1243,16 +1338,18 @@ struct mtmd_tokenizer { return 0; } - std::vector split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) { + std::vector split_batch_to_chunk(mtmd_image_preproc_out && preproc_out, const std::string & id) { std::vector chunks; - for (auto & entry : batch_f32.entries) { + auto process_chunk = [&](clip_image_f32 && img) { mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); - image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get()); + image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, &img); image_tokens->ny = 1; - image_tokens->batch_f32.entries.push_back(std::move(entry)); + image_tokens->batch_f32.entries.push_back(std::move(img)); image_tokens->id = id; + GGML_ASSERT(image_tokens->nx > 0); + mtmd_input_chunk chunk{ MTMD_INPUT_CHUNK_TYPE_IMAGE, {}, // text tokens @@ -1260,6 +1357,21 @@ struct mtmd_tokenizer { nullptr, // audio tokens }; chunks.emplace_back(std::move(chunk)); + }; + + // overview image first + auto & overview = preproc_out.overview; + if (overview.nx() == 0 || overview.ny() == 0) { + throw std::runtime_error(string_format("%s: invalid overview image for llava-uhd style preprocessing\n", __func__)); + } + process_chunk(std::move(preproc_out.overview)); + + // then, process slices + for (auto & entry : preproc_out.entries) { + if (entry.nx() == 0 || entry.ny() == 0) { + throw std::runtime_error(string_format("%s: invalid image slice for llava-uhd style preprocessing\n", __func__)); + } + process_chunk(std::move(entry)); } return chunks; @@ -1327,7 +1439,32 @@ int32_t mtmd_tokenize(mtmd_context * ctx, } } -int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { +static int32_t mtmd_encode_impl(mtmd_context * ctx, const mtmd_image_tokens * image_tokens, std::vector & out_embd) { + clip_ctx * ctx_clip = ctx->ctx_v; + if (!ctx_clip) { + LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__); + return 1; + } + + int n_embd_out = ctx->n_embd_out(); + auto n_tokens_out = image_tokens->n_tokens(); + out_embd.resize((size_t)n_embd_out * n_tokens_out); + + if (image_tokens->is_placeholder()) { + LOG_ERR("%s: image tokens batch is placeholder\n", __func__); + return 1; + } + + bool ok = clip_image_batch_encode( + ctx_clip, + ctx->n_threads, + &image_tokens->batch_f32, + out_embd); + + return ok ? 0 : 1; +} + +static int32_t mtmd_encode_chunk_impl(mtmd_context * ctx, const mtmd_input_chunk * chunk, std::vector & out_embd) { if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) { LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n"); return 0; @@ -1344,7 +1481,7 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { LOG_ERR("%s: image tokens batch is placeholder\n", __func__); return 1; } - return mtmd_encode(ctx, chunk->tokens_image.get()); + return mtmd_encode_impl(ctx, chunk->tokens_image.get(), out_embd); } else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) { if (!ctx->ctx_a) { LOG_ERR("%s: model does not support audio input\n", __func__); @@ -1358,13 +1495,13 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { LOG_ERR("%s: audio tokens batch is placeholder\n", __func__); return 1; } - int n_mmproj_embd = ctx->n_embd_text; - ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd); + int n_mmproj_embd = ctx->n_embd_out(); + out_embd.resize((size_t)chunk->tokens_audio->n_tokens * n_mmproj_embd); bool ok = clip_image_batch_encode( ctx->ctx_a, ctx->n_threads, &chunk->tokens_audio->batch_f32, - ctx->image_embd_v.data()); + out_embd); return ok ? 0 : 1; } @@ -1372,58 +1509,155 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { return 1; } +int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) { + // this is the non-batching version + try { + return mtmd_encode_chunk_impl(ctx, chunk, ctx->out_embd); + } catch (const std::exception & e) { + LOG_ERR("%s: error: %s\n", __func__, e.what()); + return 1; + } +} + int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { - clip_ctx * ctx_clip = ctx->ctx_v; - if (!ctx_clip) { - LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__); + try { + return mtmd_encode_impl(ctx, image_tokens, ctx->out_embd); + } catch (const std::exception & e) { + LOG_ERR("%s: error: %s\n", __func__, e.what()); return 1; } - auto proj_type = clip_get_projector_type(ctx_clip); - int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip); - ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); - bool ok = false; - - if (clip_is_llava(ctx_clip) - || proj_type == PROJECTOR_TYPE_MINICPMV - || proj_type == PROJECTOR_TYPE_GLM_EDGE - || proj_type == PROJECTOR_TYPE_INTERNVL - || proj_type == PROJECTOR_TYPE_DEEPSEEKOCR2 - || proj_type == PROJECTOR_TYPE_GRANITE4_VISION) { - // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() - const auto & entries = image_tokens->batch_f32.entries; - // entries may have different token counts - // e.g., DeepSeek-OCR-2: 144 per tile views, 257 for the global view - size_t offset = 0; - for (size_t i = 0; i < entries.size(); i++) { - if (entries[i]->is_placeholder()) { - LOG_ERR("%s: image tokens batch entry %zu is placeholder\n", __func__, i); - return 1; +} + +float * mtmd_get_output_embd(mtmd_context * ctx) { + return ctx->out_embd.data(); +} + +mtmd_batch * mtmd_batch_init(mtmd_context * ctx) { + return new mtmd_batch(ctx); +} + +void mtmd_batch_free(mtmd_batch * batch) { + if (batch) { + delete batch; + } +} + +int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk) { + if (chunk->type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + LOG_ERR("%s: text chunk is not supported in batch\n", __func__); + return 1; + } + + auto * ctx = batch->ctx->get_clip_ctx(chunk); + if (!ctx) { + LOG_ERR("%s: model does not support input chunk type %d\n", __func__, (int)chunk->type); + return 1; + } + + if (batch->entries.empty()) { + // batch must have at least one chunk + batch->entries.push_back(chunk); + return 0; + } + + if (!clip_support_batch(ctx)) { + // if no batching support, batch can only have one single chunk + return 2; // "batch too large" error code + } + + int32_t new_n_tokens = batch->n_tokens() + (int32_t)mtmd_input_chunk_get_n_tokens(chunk); + if (new_n_tokens > batch->ctx->batch_max_tokens) { + return 2; // "batch too large" error code + } + + auto & first_chunk = batch->entries[0]; + if (first_chunk->can_batch_with(*chunk)) { + batch->entries.push_back(chunk); + return 0; + } + + return 3; // "cannot batch" error code +} + +static int32_t mtmd_batch_encode_impl(mtmd_batch * batch) { + if (batch->entries.empty()) { + LOG_ERR("%s: batch is empty\n", __func__); + return 1; + } + for (const auto * chunk : batch->entries) { + if (chunk->is_placeholder()) { + LOG_ERR("%s: chunk is placeholder\n", __func__); + return 1; + } + } + + // represent the whole batch as one single chunk + mtmd::input_chunk_ptr batch_chunk(mtmd_input_chunk_copy(batch->entries[0])); + if (batch_chunk->tokens_image) { + auto & b0_f32 = batch_chunk->tokens_image->batch_f32; + // copy all entries from other chunks into the first chunk's batch_f32 + // note: skip first entry because it's already in batch_chunk + for (size_t ic = 1; ic < batch->entries.size(); ic++) { + auto & chunk = batch->entries[ic]; + GGML_ASSERT(chunk->tokens_image); + auto b1_f32 = chunk->tokens_image->batch_f32.clone(); + for (size_t i = 0; i < b1_f32.entries.size(); i++) { + b0_f32.entries.push_back(std::move(b1_f32.entries[i])); } - int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get()); - ok = clip_image_encode( - ctx_clip, - ctx->n_threads, - entries[i].get(), - ctx->image_embd_v.data() + offset); - offset += static_cast(n_mmproj_embd) * n_tokens_per_image; } - } else { - if (image_tokens->is_placeholder()) { - LOG_ERR("%s: image tokens batch is placeholder\n", __func__); - return 1; + } else if (batch_chunk->tokens_audio) { + auto & b0_f32 = batch_chunk->tokens_audio->batch_f32; + // copy all entries from other chunks into the first chunk's batch_f32 + // note: skip first entry because it's already in batch_chunk + for (size_t ic = 1; ic < batch->entries.size(); ic++) { + auto & chunk = batch->entries[ic]; + GGML_ASSERT(chunk->tokens_audio); + auto b1_f32 = chunk->tokens_audio->batch_f32.clone(); + for (size_t i = 0; i < b1_f32.entries.size(); i++) { + b0_f32.entries.push_back(std::move(b1_f32.entries[i])); + } } - ok = clip_image_batch_encode( - ctx_clip, - ctx->n_threads, - &image_tokens->batch_f32, - ctx->image_embd_v.data()); + } else { + LOG_ERR("%s: unsupported chunk type\n", __func__); + return 1; } - return ok ? 0 : 1; + LOG_DBG("%s: encoding batch with %zu entries and total %zu tokens\n", + __func__, batch->entries.size(), mtmd_input_chunk_get_n_tokens(batch_chunk.get())); + int32_t res = mtmd_encode_chunk_impl( + batch->ctx, + batch_chunk.get(), + batch->output_embd); + return res; } -float * mtmd_get_output_embd(mtmd_context * ctx) { - return ctx->image_embd_v.data(); +int32_t mtmd_batch_encode(mtmd_batch * batch) { + try { + return mtmd_batch_encode_impl(batch); + } catch (const std::exception & e) { + LOG_ERR("%s: error: %s\n", __func__, e.what()); + return 1; + } +} + +float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk) { + if (batch->output_embd.empty()) { + LOG_ERR("%s: batch has not been encoded yet\n", __func__); + return nullptr; + } + size_t offset = 0; + const size_t n_embd = batch->ctx->n_embd_out(); + for (const auto * c : batch->entries) { + size_t offset_prev = offset; + size_t n_tokens = mtmd_input_chunk_get_n_tokens(c); + offset += n_tokens * n_embd; + GGML_ASSERT(offset_prev < batch->output_embd.size()); + GGML_ASSERT(offset <= batch->output_embd.size()); + if (c == chunk) { + return &batch->output_embd.data()[offset_prev]; + } + } + return nullptr; // not found } bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk) { @@ -1801,7 +2035,7 @@ static void mtmd_debug_encode_impl(mtmd_context * ctx, clip_ctx * ctx_clip, clip ctx_clip, ctx->n_threads, &image, - embd_output.data()); + embd_output); if (!ok) { LOG_ERR("%s: failed to encode image\n", __func__); } @@ -1853,16 +2087,18 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector clip_image_u8 img_u8; img_u8.set_size({nx, ny}, false); img_u8.cpy_buf(rgb_values); - clip_image_f32_batch batch_f32; GGML_ASSERT(ctx->image_preproc != nullptr); - bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32); - if (!ok) { - LOG_ERR("%s: failed to preprocess image\n", __func__); - return; + mtmd_image_preproc_out preproc_out = ctx->image_preproc->preprocess(img_u8); + + clip_image_f32_batch batch_f32; + batch_f32.is_audio = false; + for (auto & entry : preproc_out.entries) { + batch_f32.entries.push_back(std::move(entry)); } + LOG_INF("%s: preprocessed image to batch_f32 with %d entries\n", __func__, (int)batch_f32.entries.size()); for (size_t i = 0; i < batch_f32.entries.size(); i++) { - LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i]->nx(), batch_f32.entries[i]->ny()); + LOG_INF("%s: entry %zu has nx=%d, ny=%d\n", __func__, i, batch_f32.entries[i].nx(), batch_f32.entries[i].ny()); // TODO: better way to dump entry content? } } @@ -1901,9 +2137,12 @@ std::map mtmd_get_memory_usage(const char * mmproj_f mtmd::context_ptr ctx; auto saved_log_callback = g_logger_state.log_callback; auto saved_log_user_data = g_logger_state.log_callback_user_data; + + ctx_params.progress_callback = nullptr; + try { mtmd_log_set(stub_log_callback, nullptr); // suppress logging - ctx.reset(new mtmd_context(mmproj_fname, nullptr, ctx_params)); + ctx.reset(new mtmd_context(mmproj_fname, nullptr, ctx_params, true)); mtmd_log_set(saved_log_callback, saved_log_user_data); // restore log callback std::map total_mem; auto merge = [&](const struct clip_ctx * c) { diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index a76a6ec2b882..25d51ef58d41 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -63,6 +63,7 @@ struct mtmd_bitmap; struct mtmd_image_tokens; struct mtmd_input_chunk; struct mtmd_input_chunks; +struct mtmd_batch; struct mtmd_input_text { const char * text; @@ -80,6 +81,9 @@ typedef struct mtmd_image_tokens mtmd_image_tokens; typedef struct mtmd_input_chunk mtmd_input_chunk; typedef struct mtmd_input_chunks mtmd_input_chunks; typedef struct mtmd_input_text mtmd_input_text; +typedef struct mtmd_batch mtmd_batch; + +typedef bool (*mtmd_progress_callback)(float progress, void * user_data); struct mtmd_context_params { bool use_gpu; @@ -97,6 +101,17 @@ struct mtmd_context_params { // callback function passed over to mtmd proper ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; + + // batching params + int32_t batch_max_tokens; // maximum number of output tokens in a batch + // (note: this is not a hard-limit, the first image will always be added even if it exceeds this limit) + // (default: 1024) + + // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. + // If the provided progress_callback returns true, model loading continues. + // If it returns false, model loading is immediately aborted. + mtmd_progress_callback progress_callback; + void * progress_callback_user_data; }; MTMD_API const char * mtmd_default_marker(void); @@ -265,12 +280,12 @@ MTMD_API int32_t mtmd_tokenize(mtmd_context * ctx, const mtmd_bitmap ** bitmaps, size_t n_bitmaps); -// returns 0 on success -// TODO: deprecate -MTMD_API int32_t mtmd_encode(mtmd_context * ctx, - const mtmd_image_tokens * image_tokens); +DEPRECATED(MTMD_API int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens), + "use mtmd_encode_chunk() instead"); +// text chunk will be ignored silently, only media chunk will be encoded // returns 0 on success +// returns 1 on generic error MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk); @@ -279,6 +294,26 @@ MTMD_API int32_t mtmd_encode_chunk(mtmd_context * ctx, // llama_model_n_embd_inp(model) * mtmd_input_chunk_get_n_tokens(chunk) * sizeof(float) MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); + +// batch encoding API +// chunks are not owned by the batch, they will not be freed by mtmd_batch_free() +// batch is valid for a given context, cannot be shared across contexts +MTMD_API mtmd_batch * mtmd_batch_init(mtmd_context * ctx); +MTMD_API void mtmd_batch_free(mtmd_batch * batch); + +// only media chunks are allowed, text chunks will be rejected +// returns 0 on success +// returns 1 on generic error +// returns 2 if the batch is too large (chunk won't be added) +// returns 3 if it cannot be batched with the existing chunks in the batch +MTMD_API int32_t mtmd_batch_add_chunk(mtmd_batch * batch, const mtmd_input_chunk * chunk); + +// returns 0 on success +// returns 1 on generic error +MTMD_API int32_t mtmd_batch_encode(mtmd_batch * batch); +MTMD_API float * mtmd_batch_get_output_embd(mtmd_batch * batch, const mtmd_input_chunk * chunk); + + // Set callback for all future logging events. // If this is not called, or NULL is supplied, everything is output on stderr. MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data); @@ -336,6 +371,11 @@ struct mtmd_input_chunk_deleter { }; using input_chunk_ptr = std::unique_ptr; +struct mtmd_batch_deleter { + void operator()(mtmd_batch * val) { mtmd_batch_free(val); } +}; +using batch_ptr = std::unique_ptr; + struct bitmap { bitmap_ptr ptr; bitmap() : ptr(nullptr) {} diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh index 5da48d61bfd3..6fe26478ab60 100755 --- a/tools/mtmd/tests.sh +++ b/tools/mtmd/tests.sh @@ -13,6 +13,8 @@ mkdir -p $SCRIPT_DIR/output PROJ_ROOT="$SCRIPT_DIR/../.." cd $PROJ_ROOT +export MTMD_TEST_RESPONSE_MARKER="" + # Check if the first argument is "big", then run test with big models # This is useful if we're running the script on a larger machine, so we can test the big models RUN_BIG_TESTS=false @@ -28,6 +30,15 @@ if [ "${1:-}" = "huge" ]; then echo "Include BIG and HUGE models..." fi +USE_VIDEO=false +if [ "${1:-}" = "video" ]; then + USE_VIDEO=true + echo "Using video as input..." + # behavior of USE_VIDEO: + # do NOT check if the output contains "new york", only verify if the exit code is 0 + # when printing the result, print the OK/FAIL line then print the generated text +fi + # Check if the second argument is "flash", then enable flash attention # This is useful to test if flash attention off works correctly FLASH_ATTN="on" @@ -50,13 +61,20 @@ add_test_vision() { if [ $# -gt 0 ]; then extra_args=$(printf " %q" "$@") fi + if [ "$USE_VIDEO" = true ]; then + arr_file+=("test-3.mp4") + else + arr_file+=("test-1.jpeg") + fi arr_prefix+=("[vision]") arr_hf+=("$hf") arr_extra_args+=("$extra_args") - arr_file+=("test-1.jpeg") } add_test_audio() { + if [ "$USE_VIDEO" = true ]; then + return 0 + fi local hf=$1 shift local extra_args="" @@ -166,19 +184,35 @@ for i in "${!arr_hf[@]}"; do cmd+=" -p \"what is the publisher name of the newspaper?\"" fi - output=$(eval "$cmd" 2>&1 | tee /dev/tty) + exit_code=0 + output=$(eval "$cmd" 2>&1 | tee /dev/tty) || exit_code=$? echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log - # either contains "new york" or both "men" and "walk" - if echo "$output" | grep -iq "new york" \ - || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk") - then - result="$prefix \033[32mOK\033[0m: $hf" + if [ "$USE_VIDEO" = true ]; then + # for video, only check exit code; do not grep for "new york" + if [ $exit_code -eq 0 ]; then + result="$prefix \033[32mOK\033[0m: $hf" + else + result="$prefix \033[31mFAIL\033[0m: $hf" + fi + # append generated text (after the response marker) + generated_text=$(echo "$output" | sed "1,/${MTMD_TEST_RESPONSE_MARKER}/d" | tail -10) + if [ -n "$generated_text" ]; then + result+="\n$generated_text" + fi + echo -e "$result" else - result="$prefix \033[31mFAIL\033[0m: $hf" + # either contains "new york" or both "men" and "walk" + if echo "$output" | grep -iq "new york" \ + || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk") + then + result="$prefix \033[32mOK\033[0m: $hf" + else + result="$prefix \033[31mFAIL\033[0m: $hf" + fi + echo -e "$result" fi - echo -e "$result" arr_res+=("$result") echo "" diff --git a/tools/mtmd/tests/test-deepseek-ocr.py b/tools/mtmd/tests/test-deepseek-ocr.py index 5f5fef765a62..ec0b4523be9a 100644 --- a/tools/mtmd/tests/test-deepseek-ocr.py +++ b/tools/mtmd/tests/test-deepseek-ocr.py @@ -9,6 +9,7 @@ import argparse import logging +import re import subprocess import sys import unicodedata @@ -28,6 +29,12 @@ class ModelSpec: mmproj_arg: str model_default: str mmproj_default: str + prompt: str = "Free OCR. " + n_predict: int = 512 + n_ctx: int | None = None + # Unlimited-OCR's "document parsing" prompt emits <|det|> grounding markup that + # the HF reference strips in result.md; drop it before scoring to match. + strip_grounding: bool = False @dataclass @@ -63,6 +70,20 @@ def chrf_min(self) -> float: model_default="gguf_models/deepseek-ai/deepseek-ocr-2-bf16.gguf", mmproj_default="gguf_models/deepseek-ai/mmproj-deepseek-ocr-2-bf16.gguf", ), + "unlimited": ModelSpec( + key="unlimited", label="Unlimited-OCR", + model_arg="--llama-model-unlimited", mmproj_arg="--mmproj-unlimited", + model_default="gguf_models/baidu/unlimited-ocr-bf16.gguf", + mmproj_default="gguf_models/baidu/mmproj-unlimited-ocr-bf16.gguf", + # "Free OCR." immediately emits EOS on this checkpoint; the HF reference + # (demo/unlimited_ocr_scores.py) uses "document parsing.", which grounds. + prompt="document parsing.", + # Grounding emits ~3x the tokens of plain OCR, so it needs a larger budget + # and context to reach the article body the ground truth covers. + n_predict=4096, + n_ctx=16384, + strip_grounding=True, + ), } CASES = [ @@ -82,9 +103,26 @@ def chrf_min(self) -> float: # is one pixel off and lands at ~0.69 instead. hf_cer=0.7761, hf_chrf=28.70, cer_tol=0.12, chrf_tol=8.0, ), + TestCase( + model_key="unlimited", label="single-view scan", + image="tools/mtmd/test-1.jpeg", + ground_truth="tools/mtmd/tests/test-1-ground-truth.txt", + # HF reference: Unlimited-OCR scoring (gundam, bf16) on this image/ground-truth. + # Decoder runs full MHA, not R-SWA; the band absorbs that gap + bf16 variance. + hf_cer=0.1869, hf_chrf=75.23, cer_tol=0.06, chrf_tol=6.0, + ), ] +GROUNDING_TAG_RE = re.compile(r"<\|(ref|det)\|>.*?<\|/\1\|>", re.DOTALL) + + +def strip_grounding(text: str) -> str: + """Drop <|ref|>..<|/ref|> / <|det|>..<|/det|> grounding markup, matching the + cleaned result.md the HF reference scores against.""" + return GROUNDING_TAG_RE.sub("", text) + + def arg_dest(flag: str) -> str: return flag.lstrip("-").replace("-", "_") @@ -129,19 +167,19 @@ def compute_chrf(expected: str, ocr_out: str) -> float: return CHRF().sentence_score(ocr_out, [expected]).score -def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str: +def run_mtmd_cli(spec: "ModelSpec", model_path, mmproj_path, image_path, bin_path) -> str: """Run mtmd-cli on the image and return its output.""" cmd = [ str(bin_path), "-m", str(model_path), "--mmproj", str(mmproj_path), "--image", str(image_path), - "-p", "Free OCR. ", + "-p", spec.prompt, "--chat-template", "deepseek-ocr", "--temp", "0", "--flash-attn", "off", # match the HF "eager" attention reference "--no-warmup", - "-n", "512", # cap loops on hard images (KV would otherwise fill) + "-n", str(spec.n_predict), # cap loops on hard images (KV would otherwise fill) # HF decodes with no_repeat_ngram_size; llama.cpp's analog is DRY. # Default DRY breakers include "\n", so they are cleared below. "--dry-multiplier", "0.8", @@ -150,6 +188,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str: "--dry-penalty-last-n", "-1", "--dry-sequence-breaker", "none", ] + if spec.n_ctx is not None: + cmd += ["-c", str(spec.n_ctx)] logger.debug(f" command: {' '.join(cmd)}") try: @@ -164,6 +204,8 @@ def run_mtmd_cli(model_path, mmproj_path, image_path, bin_path) -> str: raise RuntimeError(f"llama-mtmd-cli failed with code {result.returncode}") output = result.stdout.decode("utf-8", errors="replace").strip() + if spec.strip_grounding: + output = strip_grounding(output) if not output: raise RuntimeError("llama-mtmd-cli produced no output on stdout") logger.info(f" output: {len(output)} chars") @@ -193,7 +235,7 @@ def evaluate(case: "TestCase", expected: str, ocr_out: str) -> bool: logger.info("") logger.info("=" * 60) - logger.info("Free OCR evaluation:") + logger.info("OCR evaluation:") logger.info("=" * 60) logger.info(f" CER {cer:>7.4f} (HF {case.hf_cer:.4f}, <= {case.cer_max:>7.4f} -> {verdict(cer_pass)})") logger.info(f" chrF (0-100) {chrf:>7.2f} (HF {case.hf_chrf:.2f}, >= {case.chrf_min:>7.2f} -> {verdict(chrf_pass)})") @@ -269,9 +311,9 @@ def main() -> int: expected = read_expected_text(ground_truth) logger.info(f" Image: {case.image}") logger.info(f" Expected text: {len(expected)} chars") - logger.info(" Running llama.cpp 'Free OCR'") + logger.info(f" Running llama.cpp prompt {model_spec.prompt!r}") try: - ocr_out = run_mtmd_cli(model, mmproj, image, binary) + ocr_out = run_mtmd_cli(model_spec, model, mmproj, image, binary) except RuntimeError as e: logger.error(f" Error: {e}") results[title] = False diff --git a/tools/parser/debug-template-parser.cpp b/tools/parser/debug-template-parser.cpp index 9c591a1f1186..50e8f1efb7d6 100644 --- a/tools/parser/debug-template-parser.cpp +++ b/tools/parser/debug-template-parser.cpp @@ -40,6 +40,7 @@ struct debug_options { bool enable_reasoning = true; bool debug_jinja = false; bool force_tool_call = false; + bool parallel_tool_calls = true; output_mode mode = output_mode::BOTH; input_message_type input_message = input_message_type::NONE; }; @@ -87,6 +88,7 @@ static void print_usage(const char * program_name) { LOG_ERR("\nOptions:\n"); LOG_ERR(" --no-tools Disable tool definitions\n"); LOG_ERR(" --force-tool-call Set tool calls to forced\n"); + LOG_ERR(" --parallel-tool-calls=0|1 Set parallel_tool_calls (default: 1)\n"); LOG_ERR(" --generation-prompt=0|1 Set add_generation_prompt (default: 1)\n"); LOG_ERR(" --enable-reasoning=0|1 Enable reasoning parsing (default: 1)\n"); LOG_ERR(" --output=MODE Output mode: analysis, template, both (default: both)\n"); @@ -121,6 +123,8 @@ static bool parse_options(int argc, char ** argv, debug_options & opts) { opts.debug_jinja = true; } else if (arg == "--no-tools") { opts.with_tools = false; + } else if (arg.rfind("--parallel-tool-calls=", 0) == 0) { + opts.parallel_tool_calls = parse_bool_option(arg.substr(22)); } else if (arg.rfind("--generation-prompt=", 0) == 0) { opts.generation_prompt = parse_bool_option(arg.substr(20)); } else if (arg.rfind("--enable-reasoning=", 0) == 0) { @@ -349,7 +353,7 @@ static autoparser::generation_params prepare_params(const debug_options & opts, params.tools = json(); params.tool_choice = COMMON_CHAT_TOOL_CHOICE_NONE; } - params.parallel_tool_calls = false; + params.parallel_tool_calls = opts.parallel_tool_calls; return params; } diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp index 840eefc2f5ac..15ef64c4b0ed 100644 --- a/tools/quantize/quantize.cpp +++ b/tools/quantize/quantize.cpp @@ -33,6 +33,7 @@ struct quant_option { static const std::vector QUANT_OPTIONS = { { "Q1_0", LLAMA_FTYPE_MOSTLY_Q1_0, " 1.125 bpw quantization", }, + { "Q2_0", LLAMA_FTYPE_MOSTLY_Q2_0, " 2.25 bpw quantization (group 64)", }, { "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", }, { "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", }, { "MXFP4_MOE",LLAMA_FTYPE_MOSTLY_MXFP4_MOE," MXFP4 MoE", }, diff --git a/tools/rpc/CMakeLists.txt b/tools/rpc/CMakeLists.txt index 20f114ad9bae..0eee9a922e77 100644 --- a/tools/rpc/CMakeLists.txt +++ b/tools/rpc/CMakeLists.txt @@ -1,4 +1,4 @@ -set(TARGET rpc-server) +set(TARGET ggml-rpc-server) add_executable(${TARGET} rpc-server.cpp) target_link_libraries(${TARGET} PRIVATE ggml) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/rpc/README.md b/tools/rpc/README.md index 05b7292c0324..655b65347e2d 100644 --- a/tools/rpc/README.md +++ b/tools/rpc/README.md @@ -4,8 +4,8 @@ > This example and the RPC backend are currently in a proof-of-concept development stage. As such, the functionality is fragile and > insecure. **Never run the RPC server on an open network or in a sensitive environment!** -The `rpc-server` allows exposing `ggml` devices on a remote host. -The RPC backend communicates with one or several instances of `rpc-server` and offloads computations to them. +The `ggml-rpc-server` allows exposing `ggml` devices on a remote host. +The RPC backend communicates with one or several instances of `ggml-rpc-server` and offloads computations to them. This can be used for distributed LLM inference with `llama.cpp` in the following way: ```mermaid @@ -14,15 +14,15 @@ flowchart TD rpcb<-->|TCP|srvb rpcb<-.->|TCP|srvn subgraph hostn[Host N] - srvn[rpc-server]<-.->dev4["CUDA0"] - srvn[rpc-server]<-.->dev5["CPU"] + srvn[ggml-rpc-server]<-.->dev4["CUDA0"] + srvn[ggml-rpc-server]<-.->dev5["CPU"] end subgraph hostb[Host B] - srvb[rpc-server]<-->dev3["Metal"] + srvb[ggml-rpc-server]<-->dev3["Metal"] end subgraph hosta[Host A] - srva[rpc-server]<-->dev["CUDA0"] - srva[rpc-server]<-->dev2["CUDA1"] + srva[ggml-rpc-server]<-->dev["CUDA0"] + srva[ggml-rpc-server]<-->dev2["CUDA1"] end subgraph host[Main Host] local["Local devices"]<-->ggml[llama-cli] @@ -33,7 +33,7 @@ flowchart TD class local,dev,dev2,dev3,dev4,dev5 devcls ``` -By default, `rpc-server` exposes all available accelerator devices on the host. +By default, `ggml-rpc-server` exposes all available accelerator devices on the host. If there are no accelerators, it exposes a single `CPU` device. ## Usage @@ -41,7 +41,7 @@ If there are no accelerators, it exposes a single `CPU` device. ### Remote hosts On each remote host, build the backends for each accelerator by adding `-DGGML_RPC=ON` to the build options. -For example, to build the `rpc-server` with support for CUDA accelerators: +For example, to build the `ggml-rpc-server` with support for CUDA accelerators: ```bash mkdir build-rpc-cuda @@ -50,10 +50,10 @@ cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON cmake --build . --config Release ``` -When started, the `rpc-server` will detect and expose all available `CUDA` devices: +When started, the `ggml-rpc-server` will detect and expose all available `CUDA` devices: ```bash -$ bin/rpc-server +$ bin/ggml-rpc-server ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no ggml_cuda_init: found 1 CUDA devices: @@ -67,14 +67,14 @@ Devices: You can control the set of exposed CUDA devices with the `CUDA_VISIBLE_DEVICES` environment variable or the `--device` command line option. The following two commands have the same effect: ```bash -$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052 -$ bin/rpc-server --device CUDA0 -p 50052 +$ CUDA_VISIBLE_DEVICES=0 bin/ggml-rpc-server -p 50052 +$ bin/ggml-rpc-server --device CUDA0 -p 50052 ``` ### Main host On the main host build `llama.cpp` with the backends for the local devices and add `-DGGML_RPC=ON` to the build options. -Finally, when running `llama-cli` or `llama-server`, use the `--rpc` option to specify the host and port of each `rpc-server`: +Finally, when running `llama-cli` or `llama-server`, use the `--rpc` option to specify the host and port of each `ggml-rpc-server`: ```bash $ llama-cli -hf ggml-org/gemma-3-1b-it-GGUF -ngl 99 --rpc 192.168.88.10:50052,192.168.88.11:50052 @@ -90,7 +90,7 @@ This can speed up model loading significantly, especially when using large model To enable the cache, use the `-c` option: ```bash -$ bin/rpc-server -c +$ bin/ggml-rpc-server -c ``` By default, the cache is stored in the `$HOME/.cache/llama.cpp/rpc` directory and can be controlled via the `LLAMA_CACHE` environment variable. @@ -103,8 +103,8 @@ RDMA is enabled by default when `libibverbs` is found at build time. ### Troubleshooting -Use the `GGML_RPC_DEBUG` environment variable to enable debug messages from `rpc-server`: +Use the `GGML_RPC_DEBUG` environment variable to enable debug messages from `ggml-rpc-server`: ```bash -$ GGML_RPC_DEBUG=1 bin/rpc-server +$ GGML_RPC_DEBUG=1 bin/ggml-rpc-server ``` diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index 7d427431db93..b5c40884fd6e 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -15,8 +15,12 @@ add_library(${TARGET} STATIC server-common.h server-context.cpp server-context.h + server-stream.cpp + server-stream.h server-tools.cpp server-tools.h + server-schema.cpp + server-schema.h ) if (BUILD_SHARED_LIBS) diff --git a/tools/server/README-dev.md b/tools/server/README-dev.md index 0ff334724a39..dfc9004de549 100644 --- a/tools/server/README-dev.md +++ b/tools/server/README-dev.md @@ -57,6 +57,7 @@ The core architecture consists of the following components: - `server_tokens`: Unified representation of token sequences (supports both text and multimodal tokens); used by `server_task` and `server_slot`. - `server_prompt_checkpoint`: For recurrent (e.g., RWKV) and SWA models, stores snapshots of KV cache state. Enables reuse when subsequent requests share the same prompt prefix, saving redundant computation. - `server_models`: Standalone component for managing multiple backend instances (used in router mode). It is completely independent of `server_context`. +- `stream_session_manager`: Process wide owner of resumable SSE stream sessions (`g_stream_sessions`), keyed by conversation id. Backs the replay buffer that lets a client reattach to a generation after an HTTP disconnect. See the "Resumable streaming" section below. ```mermaid graph TD @@ -117,6 +118,58 @@ Here is an example trace of an API request for text completion: - As the response is stateless, `server_res_generator` calls `response->update()` to update the response with the current state. - `server_res_generator` then calls `response->to_json()` and passes the response to the HTTP layer. +### Resumable streaming (SSE replay buffer) + +By default a streaming generation is bound to its HTTP socket: when the socket drops (refresh, tab close, mobile background, transient network) the generation aborts and the live stream is lost. This feature keeps the generation running server side and lets a client reattach. + +It is opt in via the `X-Conversation-Id` header on `POST /v1/chat/completions`. Without the header the OAI strict path is unchanged. The conversation id is the only identity end to end (server map key, client localStorage key, route path), with an optional `::model` suffix for direct routing in router mode. + +The feature lives entirely in `server-stream.{h,cpp}` and rests on three types: + +- `stream_session`: a bounded ring buffer (4 MiB cap, oldest bytes drop first) plus a condvar. `append` pushes raw SSE bytes, `read_from` drains from any offset and blocks for live bytes or finalize, `finalize` wakes readers, `cancel` stops the producer. One conv maps to at most one live session. +- `stream_session_manager` (`g_stream_sessions`): owns all sessions keyed by conv id, enforces the one conv one session invariant via `create_or_replace`, and runs a GC thread that drops completed sessions past their TTL. +- `stream_pipe_producer` / `stream_pipe_consumer`: the write and read ends. The producer owns the session lifetime and finalizes it on destruction; the consumer is read only and never finalizes, so a reader detaching cannot kill a running generation. + +Producer side: `server_res_generator` attaches a producer pipe when the header is present. The HTTP content provider mirrors every chunk into the ring before writing it to the socket. While a pipe is attached, `stream_aware_should_stop` ignores peer disconnect, so a dropped socket does not stop generation: only an explicit `DELETE` does. When the peer leaves early, `on_complete` calls `close()`, which drains the rest of the generation into the ring on the http worker. + +Lifetime safety: the producer pipe holds a shared `alive` flag also captured by the session cancel hook. `~server_res_generator` calls `cleanup()` to clear that hook while the reader is still alive, so a `cancel` arriving during teardown can never call `stop()` on a freed response. This ordering is the most fragile part of the feature: finalizing or destroying the producer before `cleanup()` runs reintroduces a use after free. + +Consumer side: `GET /v1/stream/?from=N` opens a `text/event-stream` that replays buffered bytes from offset `N` and blocks for live bytes, so the browser reattaches like a fresh EventSource. An offset below the dropped prefix returns 400. + +Routes: + +- `GET /v1/stream/:conv_id?from=N`: replay or live reattach. +- `POST /v1/streams/lookup` with `{"conversation_ids": [...]}`: returns session status only for ids the caller already owns. There is no listing route, so live sessions cannot be enumerated (an earlier `GET /v1/streams` was removed for exactly this reason). +- `DELETE /v1/stream/:conv_id`: explicit Stop, idempotent (`evict_and_cancel`). + +Router mode binds the same paths to proxy handlers. A `conv_id -> child` map (`conv_models`), populated when a POST is routed, resolves the owning child in one lookup with no polling. The lookup groups ids per child; GET and DELETE proxy straight to the owner. This loopback REST hop is expected to move to a websocket IPC later, swapping only the transport. + +Lifecycle: `g_stream_sessions.start_gc()` runs in main after common init, `stop_gc()` runs first in `clean_up()` and finalizes every live session so no reader hangs. Reader blocking and the post drop drain both run on httplib worker threads, which block on a condvar rather than spin. + +| Constant | Value | Role | +| --- | --- | --- | +| `STREAM_SESSION_TTL_SECONDS` | 300 | retention of a completed session before GC | +| `STREAM_SESSION_MAX_BYTES` | 4 MiB | ring cap per session | +| `STREAM_SESSION_GC_INTERVAL_SECONDS` | 60 | GC tick | +| `STREAM_READ_WAKE_INTERVAL_MS` | 200 | read_from wake to recheck should_stop | +| `STREAM_LOOKUP_TIMEOUT_MS` | 250 | router to child loopback budget | + +```mermaid +graph TD + Client -- "POST + X-Conversation-Id" --> RG[server_res_generator] + RG -- attach --> Prod[stream_pipe_producer] + Prod -- "write, drain on peer drop" --> Sess + subgraph g_stream_sessions + Sess[stream_session: ring buffer, 4 MiB] + GC[GC thread] -- drop after TTL --> Sess + end + Sess -- read_from offset --> Cons[stream_pipe_consumer] + Cons -- "GET /v1/stream/:id?from=N" --> Client + DEL[DELETE /v1/stream/:id] -- evict_and_cancel --> Sess +``` + +The diagram shows the buffer touch points. The live wire (chunks streamed to the original client during a normal generation) is the producer's default output, described under "Producer side" above. + ### Testing `llama-server` includes an automated test suite based on `pytest`. @@ -180,6 +233,35 @@ That requires `JSON.stringify` when formatted to message content: } ``` +### Router mode: how child <--> router communicates + +Upon spawning a new child process using `subprocess`, both child and router listen to the stdout/stderr (combined) + +For the direction from child to router: +- Generic messages are logs, it will be forwarded to router's stdout +- Special state update messages are prefixed by `cmd_child_to_router:state:`, followed by a JSON. See `server_models::handle_child_state` for more + +For the direction from router to child: +- When server sends `cmd_router_to_child:exit`, the child should exit gracefully --> if after `DEFAULT_STOP_TIMEOUT` and the child is still running, force-kill it + +### Model management API (router mode) + +Model management API was added via PR [#23976](https://github.com/ggml-org/llama.cpp/pull/23976) + +The main goal of this API is to allow downloading models and/or removing models from the web UI. It relies on the model cache infrastructure under the hood to manage the list of models dynamically. + +Instead of building everything from the ground up (like what most AI agents will do when you ask them to implement a similar feature), we built on top of existing, already well-engineered components inside the codebase: +- Model cache infrastructure as mentioned above (`common/download.h`) +- Server response queue (`server-queue.h`). We use this feature to broadcast events to SSE clients. +- Server router thread management (`server-models.h`). We re-use the same thread model that is used for managing subprocess life cycle, except that we don't create a new subprocess, but launch the download right inside the thread. + +The flow for downloading a new model: +- POST request comes in --> `post_router_models` --> validation +- A new `llama-server` subprocess will be spawned with special `SERVER_CHILD_MODE_DOWNLOAD` +- Child process runs the download and report status back to router via stdin/out +- If a stop request comes in, the router asks the child process to stop (same mechanism as running a model in child process) +- Otherwise, upon completion, we call `load_models()` to refresh the list of models + ### Notable Related PRs - Initial server implementation: https://github.com/ggml-org/llama.cpp/pull/1443 @@ -194,6 +276,7 @@ That requires `JSON.stringify` when formatted to message content: - Speculative decoding: https://github.com/ggml-org/llama.cpp/pull/17808 and rework in https://github.com/ggml-org/llama.cpp/pull/17808 - INI presets: https://github.com/ggml-org/llama.cpp/pull/17859 (+ refactoring: https://github.com/ggml-org/llama.cpp/pull/18169) - Sleeping mode: https://github.com/ggml-org/llama.cpp/pull/18228 +- Resumable streaming (SSE replay buffer): https://github.com/ggml-org/llama.cpp/pull/23226 diff --git a/tools/server/README.md b/tools/server/README.md index b41491059020..e88bc5f28ad0 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -175,13 +175,12 @@ For the full list of features, please refer to [server's changelog](https://gith | `-np, --parallel N` | number of server slots (default: -1, -1 = auto)
(env: LLAMA_ARG_N_PARALLEL) | | `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | | `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md
note: if -hf is used, this argument can be omitted
(env: LLAMA_ARG_MMPROJ) | -| `-tk, --talker-model FILE` | path to the qwen3-omni talker gguf, enables the /v1/audio/speech endpoint
(env: LLAMA_ARG_TALKER_MODEL) | -| `-c2w, --code2wav-model FILE` | path to the qwen3-omni code2wav gguf, the talker code detokenizer
(env: LLAMA_ARG_CODE2WAV_MODEL) | | `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md
(env: LLAMA_ARG_MMPROJ_URL) | | `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)
(env: LLAMA_ARG_MMPROJ_AUTO) | | `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)
(env: LLAMA_ARG_MMPROJ_OFFLOAD) | | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MIN_TOKENS) | | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | +| `--mtmd-batch-max-tokens N` | maximum number of image tokens per batch when encoding images (default: 1024)
(env: LLAMA_ARG_MTMD_BATCH_MAX_TOKENS) | | `-a, --alias STRING` | set model name aliases, comma-separated (to be used by API)
(env: LLAMA_ARG_ALIAS) | | `--tags STRING` | set model tags, comma-separated (informational, not used for routing)
(env: LLAMA_ARG_TAGS) | | `--embd-normalize N` | normalisation for embeddings (default: 2) (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm) | @@ -190,23 +189,21 @@ For the full list of features, please refer to [server's changelog](https://gith | `--reuse-port` | allow multiple sockets to bind to the same port (default: disabled)
(env: LLAMA_ARG_REUSE_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )
(env: LLAMA_ARG_API_PREFIX) | -| `--webui-config JSON` | [DEPRECATED: use --ui-config] JSON that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG) | -| `--ui-config JSON` | JSON that provides default UI settings (overrides UI defaults)
(env: LLAMA_ARG_UI_CONFIG) | -| `--webui-config-file PATH` | [DEPRECATED: use --ui-config-file] JSON file that provides default WebUI settings (overrides WebUI defaults)
(env: LLAMA_ARG_WEBUI_CONFIG_FILE) | -| `--ui-config-file PATH` | JSON file that provides default UI settings (overrides UI defaults)
(env: LLAMA_ARG_UI_CONFIG_FILE) | -| `--webui-mcp-proxy, --no-webui-mcp-proxy` | [DEPRECATED: use --ui-mcp-proxy/--no-ui-mcp-proxy] experimental: whether to enable MCP CORS proxy
(env: LLAMA_ARG_WEBUI_MCP_PROXY) | -| `--ui-mcp-proxy, --no-ui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)
(env: LLAMA_ARG_UI_MCP_PROXY) | +| `--ui-config, --webui-config JSON` | JSON that provides default UI settings (overrides UI defaults)
(env: LLAMA_ARG_UI_CONFIG) | +| `--ui-config-file, --webui-config-file PATH` | JSON file that provides default UI settings (overrides UI defaults)
(env: LLAMA_ARG_UI_CONFIG_FILE) | +| `--ui-mcp-proxy, --webui-mcp-proxy, --no-ui-mcp-proxy, --no-webui-mcp-proxy` | experimental: whether to enable MCP CORS proxy - do not enable in untrusted environments (default: disabled)
(env: LLAMA_ARG_UI_MCP_PROXY) | | `--tools TOOL1,TOOL2,...` | experimental: whether to enable built-in tools for AI agents - do not enable in untrusted environments (default: no tools)
specify "all" to enable all tools
available tools: read_file, file_glob_search, grep_search, exec_shell_command, write_file, edit_file, apply_diff, get_datetime
(env: LLAMA_ARG_TOOLS) | -| `--webui, --no-webui` | [DEPRECATED: use --ui/--no-ui] whether to enable the Web UI
(env: LLAMA_ARG_WEBUI) | -| `--ui, --no-ui` | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_UI) | +| `-ag, --agent, -no-ag, --no-agent` | whether to enable CORS proxy and all built-in tools - do not enable in untrusted environments (default: disabled)
(env: LLAMA_ARG_AGENT) | +| `--ui, --webui, --no-ui, --no-webui` | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_UI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--rerank, --reranking` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | | `--api-key KEY` | API key to use for authentication, multiple keys can be provided as a comma-separated list (default: none)
(env: LLAMA_API_KEY) | -| `--api-key-file FNAME` | path to file containing API keys (default: none)
(env: LLAMA_ARG_API_KEY_FILE) | +| `--api-key-file FNAME` | path to file containing API keys, one per line; lines starting with a hash are treated as comments (default: none)
(env: LLAMA_ARG_API_KEY_FILE) | | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key
(env: LLAMA_ARG_SSL_KEY_FILE) | | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate
(env: LLAMA_ARG_SSL_CERT_FILE) | | `--chat-template-kwargs STRING` | sets additional params for the json template parser, must be a valid json object string, e.g. '{"key1":"value1","key2":"value2"}'
(env: LLAMA_ARG_CHAT_TEMPLATE_KWARGS) | | `-to, --timeout N` | server read/write timeout in seconds (default: 3600)
(env: LLAMA_ARG_TIMEOUT) | +| `--sse-ping-interval N` | server SSE ping interval in seconds (-1 = disabled, default: 30)
(env: LLAMA_ARG_SSE_PING_INTERVAL) | | `--threads-http N` | number of threads used to process HTTP requests (default: -1)
(env: LLAMA_ARG_THREADS_HTTP) | | `--cache-prompt, --no-cache-prompt` | whether to enable prompt caching (default: enabled)
(env: LLAMA_ARG_CACHE_PROMPT) | | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting, requires prompt caching to be enabled (default: 0)
[(card)](https://ggml.ai/f0.png)
(env: LLAMA_ARG_CACHE_REUSE) | @@ -231,6 +228,7 @@ For the full list of features, please refer to [server's changelog](https://gith | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled) | | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | | `--sleep-idle-seconds SECONDS` | number of seconds of idleness after which the server will sleep (default: -1; -1 = disabled) | +| `--log-prompts-dir PATH` | Log prompts to directory (only used for debugging, default: disabled) | | `--spec-draft-hf, -hfd, -hfrd, --hf-repo-draft /[:quant]` | Same as --hf-repo, but for the draft model (default: unused)
(env: LLAMA_ARG_SPEC_DRAFT_HF_REPO) | | `--spec-draft-threads, -td, --threads-draft N` | number of threads to use during generation (default: same as --threads) | | `--spec-draft-threads-batch, -tbd, --threads-batch-draft N` | number of threads to use during batch and prompt processing (default: same as --threads-draft) | @@ -1232,8 +1230,6 @@ print(completion.choices[0].text) Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used. -If model supports multimodal, you can input the media file via `image_url` content part. We support both base64 and remote URL as input. See OAI documentation for more. - *Options:* See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). llama.cpp `/completion`-specific features such as `mirostat` are also supported. @@ -1252,9 +1248,18 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type": `parallel_tool_calls` : Whether to enable parallel/multiple tool calls (only supported on some models, verification is based on jinja template). -For multimodal input: -- Content type `image_url` and `input_audio` are the same as OAI schema -- Content type `input_video` is an extension from OAI schema. For now, it only accepts base64 input +For multimodal input (typed content, `messages[i].content[j]`): +- If `type == "image_url"`: + - `image_url.url` can be a remote URL, base64 (raw or URI-encoded via `data:image/...;base64`) or path to local file + - Accepts formats supported by `stb_image` (jpeg, png, tga, bmp, gif, ...) +- If `type == "input_audio"`: + - Either `input_audio.data` or `input_audio.url` can be specified, can be a remote URL, raw base64 or path to local file + - Accepts formats supported by `miniaudio` (mp3, wav, flac) + - `input_audio.format` will be ignored, the file format will be determined automatically +- If `type == "input_video"`: + - Either `input_video.data` or `input_video.url` can be specified, can be a remote URL, raw base64 or path to local file + - Accepts formats supported by `ffmpeg` +- Note: for local file, make sure to set `--media-path`. File path must be prefixed by `file://` *Examples:* @@ -1778,6 +1783,20 @@ The `status` object can be: } ``` +Note: for "downloading" state, there can be multiple files be downloading in parallel + +```json +"status": { + "value": "downloading", + "progress": { + "https://...model.gguf": { + "done": 195963406, + "total": 219307424 + } + } +} +``` + ### POST `/models/load`: Load a model Load a model @@ -1820,6 +1839,135 @@ Response: } ``` +### GET `/models/sse`: Real-time events + +Example events: + +```js +{ + "model": "...", + "event": "model_status", + "data": { + "status": "loading" + } +} + +{ + "model": "...", + "event": "download_progress", + "data": { + // note: there can be multiple files being downloaded in parallel + "https://...model.gguf": { + "done": 195963406, + "total": 219307424 + } + } +} + +{ + "model": "...", + "event": "model_status", + "data": { + "status": "loading", + "progress": { + "stages": ["text_model", "spec_model", "mmproj_model"], + "current": "text_model", + "value": 0.5 + } + } +} +// note for "loading" status: +// - subsequent events will follow the same order of "stages" list +// - mmap is may report incorrect progress on some platforms; if you need exact progress, use --no-mmap + +{ + "model": "...", + "event": "model_status", + "data": { + "status": "loaded", + "info": { + // note: only include info on first load + // waking up from sleep doesn't have this + } + } +} + +{ + "model": "...", + "event": "model_status", + "data": { + "status": "sleeping" + } +} + +{ + "model": "...", + "event": "model_remove" +} + +// special event: reload of the list of all models +{ + "model": "*", + "event": "models_reload" +} +``` + +### POST `/models`: Download new model + +Trigger a new download (non-blocking), the progress can be tracked via SSE endpoint `/models/sse` + +To cancel model downloading, send an event to `/models/unload` + +Download procedure: +- Send POST request to `/models` +- Subscribe to `/models/sse` for updates +- On downloading completed, you will receive either `download_finished` or `download_failed` event +- Call GET `/models` to trigger model list update. If the download success, you should see the new model in the list + +Payload: + +```json +{ + "model": "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M", +} +``` + +Response (download is started in the background): + +```json +{ + "success": true +} +``` + +Response (error, cannot start the download): + +```json +{ + "error": { + "code": 400, + "message": "model validation failed, unable to download", + "type": "invalid_request_error" + } +} +``` + +### DELETE `/models`: Delete a model from cache + +IMPORTANT: only model stored in cache can be deleted. You cannot delete models in a preset. + +Model name must be passed via query param: `?model={name}` + +If delete success, it will send an SSE event of type `model_remove` + +Response: + +```json +{ + "success": true +} +``` + ## API errors `llama-server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi diff --git a/tools/server/bench/bench.py b/tools/server/bench/bench.py index c816816eaf46..2c56ab5ebcd6 100644 --- a/tools/server/bench/bench.py +++ b/tools/server/bench/bench.py @@ -40,6 +40,7 @@ def main(args_in: list[str] | None = None) -> None: required=True) parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True) parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True) + parser.add_argument("--offline", action="store_true", default=False, help="Offline mode: forces use of cache, prevents network access") parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True) parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True) parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True) @@ -268,6 +269,8 @@ def start_server_background(args): ] server_args.extend(['--hf-repo', args.hf_repo]) server_args.extend(['--hf-file', args.hf_file]) + if args.offline: + server_args.append('--offline') server_args.extend(['--n-gpu-layers', args.n_gpu_layers]) server_args.extend(['--ctx-size', args.ctx_size]) server_args.extend(['--parallel', args.parallel]) diff --git a/tools/server/server-common.cpp b/tools/server/server-common.cpp index 9f3caac8f723..ac291d359a0c 100644 --- a/tools/server/server-common.cpp +++ b/tools/server/server-common.cpp @@ -12,6 +12,7 @@ #include #include #include +#include json format_error_response(const std::string & message, const enum error_type type) { std::string type_str; @@ -344,6 +345,14 @@ const mtmd::input_chunk_ptr & server_tokens::find_chunk(size_t idx) const { throw std::runtime_error("Chunk not found"); } +std::pair server_tokens::find_next_media_chunk(size_t idx) const { + auto it = map_idx_to_media.upper_bound(idx); + if (it != map_idx_to_media.end()) { + return { &it->second, it->first }; + } + return { nullptr, 0 }; +} + void server_tokens::push_back(llama_token tok) { if (tok == LLAMA_TOKEN_NULL) { throw std::runtime_error("Invalid token"); @@ -509,6 +518,14 @@ size_t server_tokens::get_common_prefix(const server_tokens & b) const { return max_idx; // all tokens are equal } +common_chat_msg_spans server_tokens::find_message_spans(const common_chat_msg_delimiters & delims) const { + std::map skips; + for (const auto & it : map_idx_to_media) { + skips[it.first] = mtmd_input_chunk_get_n_tokens(it.second.get()); + } + return delims.split(tokens, skips); +} + bool server_tokens::validate(const struct llama_context * ctx) const { const llama_model * model = llama_get_model(ctx); const llama_vocab * vocab = llama_model_get_vocab(model); @@ -531,37 +548,6 @@ bool server_tokens::validate(const struct llama_context * ctx) const { return true; } -int32_t server_tokens::process_chunk( - llama_context * ctx, - mtmd_context * mctx, - size_t idx, - llama_pos pos, - int32_t seq_id, - size_t & n_tokens_out) const { - const auto & chunk = find_chunk(idx); - const char * name = mtmd_input_chunk_get_type(chunk.get()) == MTMD_INPUT_CHUNK_TYPE_IMAGE - ? "image" : "audio"; - SRV_INF("processing %s...\n", name); - int32_t n_batch = llama_n_batch(ctx); - int64_t t0 = ggml_time_ms(); - llama_pos new_n_past; // unused for now - int32_t result = mtmd_helper_eval_chunk_single(mctx, ctx, - chunk.get(), - pos, - seq_id, - n_batch, - true, // logits last - &new_n_past); - SRV_INF("%s processed in %" PRId64 " ms\n", name, ggml_time_ms() - t0); - if (result != 0) { - LOG_ERR("mtmd_helper_eval failed with status %d", result); - n_tokens_out = 0; - return result; - } - n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get()); - return 0; -} - server_tokens server_tokens::clone() const { server_tokens res; res.has_mtmd = has_mtmd; @@ -839,12 +825,21 @@ json oaicompat_completion_params_parse(const json & body) { return llama_params; } -// media_path always end with '/', see arg.cpp +// url can be +// - http(s):// for remote files +// - file:// for local files (only allowed if media_path is set) +// - data: for base64 encoded data with uri scheme (e.g. data:image/png;base64,...) +// - raw base64 encoded data static void handle_media( std::vector & out_files, - json & media_obj, - const std::string & media_path) { - std::string url = json_value(media_obj, "url", std::string()); + const std::string & url, + const std::string & media_path, + bool accept_base64_uri) { + if (!media_path.empty()) { + // should already be enforced by arg.cpp, but checking just in case + GGML_ASSERT(media_path.back() == DIRECTORY_SEPARATOR); + } + if (string_starts_with(url, "http")) { // download remote image // TODO @ngxson : maybe make these params configurable @@ -880,20 +875,28 @@ static void handle_media( data.assign((std::istreambuf_iterator(file)), std::istreambuf_iterator()); out_files.push_back(data); - } else { + } else if (accept_base64_uri && string_starts_with(url, "data:")) { // try to decode base64 image std::vector parts = string_split(url, /*separator*/ ','); if (parts.size() != 2) { - throw std::runtime_error("Invalid url value"); + throw std::runtime_error("Invalid uri-encoded base64 value"); } else if (!string_starts_with(parts[0], "data:image/")) { - throw std::runtime_error("Invalid url format: " + parts[0]); + throw std::runtime_error("Invalid uri format: " + parts[0]); } else if (!string_ends_with(parts[0], "base64")) { - throw std::runtime_error("url must be base64 encoded"); + throw std::runtime_error("uri must be base64 encoded"); } else { auto base64_data = parts[1]; auto decoded_data = base64_decode(base64_data); out_files.push_back(decoded_data); } + + } else { + // try as raw base64 string + auto decoded_data = base64_decode(url); + if (decoded_data.empty()) { + throw std::runtime_error("Invalid base64 value"); + } + out_files.push_back(decoded_data); } } @@ -979,14 +982,15 @@ json oaicompat_chat_params_parse( } for (auto & p : content) { - std::string type = json_value(p, "type", std::string()); + std::string type = json_value(p, "type", std::string()); if (type == "image_url") { if (!opt.allow_image) { throw std::runtime_error("image input is not supported - hint: if this is unexpected, you may need to provide the mmproj"); } json image_url = json_value(p, "image_url", json::object()); - handle_media(out_files, image_url, opt.media_path); + std::string url = json_value(image_url, "url", std::string()); + handle_media(out_files, url, opt.media_path, true); p["type"] = "media_marker"; p["text"] = get_media_marker(); @@ -997,17 +1001,11 @@ json oaicompat_chat_params_parse( throw std::runtime_error("audio input is not supported - hint: if this is unexpected, you may need to provide the mmproj"); } - json input_audio = json_value(p, "input_audio", json::object()); - std::string data = json_value(input_audio, "data", std::string()); - std::string format = json_value(input_audio, "format", std::string()); - // while we also support flac, we don't allow it here so we matches the OAI spec - if (format != "wav" && format != "mp3") { - throw std::invalid_argument("input_audio.format must be either 'wav' or 'mp3'"); - } - auto decoded_data = base64_decode(data); // expected to be base64 encoded - out_files.push_back(decoded_data); - - // TODO: add audio_url support by reusing handle_media() + // note: don't need to validate "format", it's redundant + json input_audio = json_value(p, "input_audio", json::object()); + std::string url = json_value(input_audio, "data", + json_value(input_audio, "url", std::string())); + handle_media(out_files, url, opt.media_path, false); p["type"] = "media_marker"; p["text"] = get_media_marker(); @@ -1018,10 +1016,10 @@ json oaicompat_chat_params_parse( throw std::runtime_error("video input is not supported - hint: if this is unexpected, you may need to provide the mmproj"); } - json input_video = json_value(p, "input_video", json::object()); - std::string data = json_value(input_video, "data", std::string()); - auto decoded_data = base64_decode(data); // expected to be base64 encoded - out_files.push_back(decoded_data); + json input_video = json_value(p, "input_video", json::object()); + std::string url = json_value(input_video, "data", + json_value(input_video, "url", std::string())); + handle_media(out_files, url, opt.media_path, false); p["type"] = "media_marker"; p["text"] = get_media_marker(); @@ -1114,21 +1112,13 @@ json oaicompat_chat_params_parse( llama_params["chat_parser"] = chat_params.parser; } - llama_params["message_spans"] = json::array(); - - for (const auto & span : chat_params.message_spans) { - llama_params["message_spans"].push_back({ - { "role", span.role }, - { "pos", span.pos }, - { "len", span.len }, - }); - } + llama_params["message_delimiters"] = chat_params.message_delimiters.to_json(); // Reasoning budget: pass parameters through to sampling layer { - int reasoning_budget = opt.reasoning_budget; - if (reasoning_budget == -1 && body.contains("thinking_budget_tokens")) { - reasoning_budget = json_value(body, "thinking_budget_tokens", -1); + int reasoning_budget = json_value(body, "thinking_budget_tokens", -1); + if (reasoning_budget == -1) { + reasoning_budget = opt.reasoning_budget; } if (!chat_params.thinking_end_tag.empty()) { @@ -1261,7 +1251,7 @@ json format_response_rerank( // other utils // -std::vector get_token_probabilities(llama_context * ctx, int idx) { +std::vector get_token_probabilities(llama_context * ctx, int idx, size_t n_top) { std::vector cur; const auto * logits = llama_get_logits_ith(ctx, idx); @@ -1280,21 +1270,34 @@ std::vector get_token_probabilities(llama_context * ctx, int i } } - // sort tokens by logits - std::sort(cur.begin(), cur.end(), [](const llama_token_data & a, const llama_token_data & b) { - return a.logit > b.logit; - }); + // sort tokens by logits (partial: only the leading `n_top` need ordering) + if (n_top > cur.size()) { + n_top = cur.size(); + } + if (n_top > 0) { + std::partial_sort(cur.begin(), cur.begin() + n_top, cur.end(), + [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }); + } // apply softmax - float max_l = cur[0].logit; + float max_l = -std::numeric_limits::infinity(); + if (n_top > 0) { + max_l = cur[0].logit; // partial_sort guarantees the absolute maximum is at index 0 + } else { + for (const auto & t : cur) { + max_l = std::max(max_l, t.logit); + } + } float cum_sum = 0.0f; - for (size_t i = 0; i < cur.size(); ++i) { - float p = expf(cur[i].logit - max_l); - cur[i].p = p; + for (auto & t : cur) { + float p = expf(t.logit - max_l); + t.p = p; cum_sum += p; } - for (size_t i = 0; i < cur.size(); ++i) { - cur[i].p /= cum_sum; + for (auto & t : cur) { + t.p /= cum_sum; } return cur; diff --git a/tools/server/server-common.h b/tools/server/server-common.h index 249b97c2fadb..c0eaec6b0267 100644 --- a/tools/server/server-common.h +++ b/tools/server/server-common.h @@ -180,6 +180,10 @@ struct server_tokens { const mtmd::input_chunk_ptr & find_chunk(size_t idx) const; + // find next media chunk after idx + // returns a pair of pointer to the chunk (nullptr if not found) and its start index in tokens + std::pair find_next_media_chunk(size_t idx) const; + void push_back(llama_token tok); // will create a copy of the chunk if it contains non-text data @@ -214,18 +218,12 @@ struct server_tokens { size_t get_common_prefix(const server_tokens & b) const; + // split the tokens into message spans, skipping over media chunks + common_chat_msg_spans find_message_spans(const common_chat_msg_delimiters & delims) const; + // make sure all text tokens are within the vocab range bool validate(const struct llama_context * ctx) const; - // encode and decode the image chunk - int32_t process_chunk( - llama_context * ctx, - mtmd_context * mctx, - size_t idx, - llama_pos pos, - int32_t seq_id, - size_t & n_tokens_out) const; - server_tokens clone() const; }; @@ -331,7 +329,7 @@ json format_response_rerank( // other utils // -std::vector get_token_probabilities(llama_context * ctx, int idx); +std::vector get_token_probabilities(llama_context * ctx, int idx, size_t n_top); std::string safe_json_to_str(const json & data); diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index bdfa51718080..5c33a418f549 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -4,6 +4,8 @@ #include "server-http.h" #include "server-task.h" #include "server-queue.h" +#include "server-schema.h" +#include "server-stream.h" #include "build-info.h" #include "common.h" @@ -15,11 +17,6 @@ #include "mtmd.h" #include "mtmd-helper.h" -#include "ggml-cpp.h" - -// TODO: tmp until the mtmd draft processing is refactored [TAG_MTMD_DRAFT_PROCESSING] -#include "../../src/llama-ext.h" - #include #include #include @@ -67,9 +64,99 @@ enum slot_state { SLOT_STATE_GENERATING, }; -enum server_state { - SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet - SERVER_STATE_READY, // Server is ready and model is loaded +struct server_slot; // forward declaration + +struct server_batch { + llama_batch batch; + bool batch_rendered = false; + + struct token { + int32_t id_slot; + llama_token token; + llama_pos pos; + bool output; + }; + std::vector tokens; + int32_t n_tokens_alloc = 0; + + // track if given slot can be batched with slots already in the batch + server_slot * slot_batched = nullptr; + + float alora_scale = -1.0f; + size_t alora_disabled_id = 0; + + server_batch() { + batch.token = nullptr; // sentinel: uninitialized batch + } + + ~server_batch() { + if (batch.token != nullptr) { + llama_batch_free(batch); + } + } + + void init(int32_t n_tokens_alloc) { + this->n_tokens_alloc = n_tokens_alloc; + batch = llama_batch_init(n_tokens_alloc, 0, 1); + tokens.reserve(n_tokens_alloc); + } + + bool add(int32_t id_slot, llama_token token, llama_pos pos, bool output) { + GGML_ASSERT(batch.token != nullptr); + if ((int32_t)tokens.size() >= n_tokens_alloc) { + return false; + } + // LOG_INF("adding token to batch: slot=%d, token=%d, pos=%d, output=%d\n", id_slot, token, pos, output); + tokens.push_back({ id_slot, token, pos, output }); + return true; + } + + void clear() { + tokens.clear(); + common_batch_clear(batch); + slot_batched = nullptr; + alora_scale = -1.0f; + alora_disabled_id = 0; + batch_rendered = false; + } + + int32_t size() const { + return (int32_t)tokens.size(); + } + + void set_output(int32_t idx, bool output) { + GGML_ASSERT(idx >= 0 && idx < (int32_t)tokens.size()); + tokens[idx].output = output; + } + + void render() { + GGML_ASSERT(batch.token != nullptr); + common_batch_clear(batch); + for (int32_t i = 0; i < size(); i++) { + const auto & t = tokens[i]; + common_batch_add(batch, t.token, t.pos, { t.id_slot }, t.output); + } + batch_rendered = true; + } + + llama_batch get_view(int32_t off, int32_t n_tokens) const { + GGML_ASSERT(batch.token != nullptr); + GGML_ASSERT(batch_rendered); + GGML_ASSERT(off >= 0 && off < size()); + GGML_ASSERT(n_tokens > 0 && off + n_tokens <= size()); + + llama_batch view = { + n_tokens, + batch.token + off, + nullptr, + batch.pos + off, + batch.n_seq_id + off, + batch.seq_id + off, + batch.logits + off, + }; + + return view; + } }; struct server_slot { @@ -80,6 +167,7 @@ struct server_slot { // multimodal mtmd_context * mctx = nullptr; + mtmd::batch_ptr mbatch = nullptr; // speculative decoding common_speculative * spec; @@ -193,9 +281,11 @@ struct server_slot { // stats size_t n_sent_text = 0; // number of sent text character - int64_t t_print_last = 0; + // TODO @ngxson : move all metrics to a sub-struct for clarity int64_t t_start_process_prompt; int64_t t_start_generation; + int64_t t_print_last = 0; + int32_t n_decoded_last = 0; double t_prompt_processing = 0.0; // ms double t_token_generation = 0.0; // ms @@ -205,6 +295,8 @@ struct server_slot { // Speculative decoding stats int32_t n_draft_total = 0; // Total draft tokens generated int32_t n_draft_accepted = 0; // Draft tokens actually accepted + int32_t n_draft_verif_steps = 0; // Total draft token verification steps by the target model + std::vector n_accepted_per_pos; // Accepted tokens per draft position void reset() { SLT_DBG(*this, "%s", "\n"); @@ -231,6 +323,8 @@ struct server_slot { // clear speculative decoding stats n_draft_total = 0; n_draft_accepted = 0; + n_draft_verif_steps = 0; + n_accepted_per_pos.clear(); task_prev = std::move(task); task.reset(); @@ -239,6 +333,9 @@ struct server_slot { // clear alora start alora_invocation_start = -1; + + // clear multimodal state + mbatch.reset(); } void init_sampler() const { @@ -348,12 +445,14 @@ struct server_slot { return n_draft_max; } - void update_batch(llama_batch & batch) { + // add sampled token of this slot to the batch, optionally add the speculative draft tokens if any + void handle_last_sampled_token(server_batch & batch) { + bool add_ok = true; if (spec_draft.empty()) { // no speculative decoding - i_batch = batch.n_tokens; + i_batch = batch.size(); - common_batch_add(batch, sampled, prompt.tokens.pos_next(), { this->id }, true); + add_ok &= batch.add(id, sampled, prompt.tokens.pos_next(), true); SLT_DBG(*this, "slot decode token, id=%d, n_ctx = %d, n_tokens = %d, truncated = %d\n", sampled, n_ctx, prompt.n_tokens(), truncated); @@ -363,19 +462,21 @@ struct server_slot { GGML_ASSERT(spec_i_batch.empty()); - spec_i_batch.push_back(batch.n_tokens); + spec_i_batch.push_back(batch.size()); for (size_t i = 0; i < spec_draft.size(); i++) { - spec_i_batch.push_back(batch.n_tokens + i + 1); + spec_i_batch.push_back(batch.size() + i + 1); } auto pos0 = prompt.tokens.pos_next(); - common_batch_add(batch, sampled, pos0++, { this->id }, true); + add_ok &= batch.add(id, sampled, pos0++, true); for (auto token : spec_draft) { - common_batch_add(batch, token, pos0++, { this->id }, true); + add_ok &= batch.add(this->id, token, pos0++, true); } } + GGML_ASSERT(add_ok && "batch must be large enough to hold the sampled and draft tokens"); + prompt.tokens.push_back(sampled); prompt.tokens.insert(spec_draft); } @@ -467,11 +568,13 @@ struct server_slot { return; } - t_print_last = t_now; + const double n_gen_second = 1e3 / (t_token_generation) * (n_decoded); + const double n_gen_second_win = 1e6 / (t_now - t_print_last) * (n_decoded - n_decoded_last); - const double n_gen_second = 1e3 / t_token_generation * n_decoded; + t_print_last = t_now; + n_decoded_last = n_decoded; - SLT_INF(*this, "n_decoded = %6d, tg = %6.2f t/s\n", n_decoded, n_gen_second); + SLT_INF(*this, "n_decoded = %6d, tg = %6.2f t/s, tg_3s = %6.2f t/s\n", n_decoded, n_gen_second, n_gen_second_win); } void print_timings_pp() const { @@ -510,10 +613,22 @@ struct server_slot { llama_perf_context(ctx_tgt).n_reused); if (n_draft_total > 0) { - const float draft_ratio = (float) n_draft_accepted / n_draft_total; + const float draft_ratio = (float) n_draft_accepted / n_draft_total; + const double mean_acc_len = n_draft_verif_steps > 0 ? 1.0 + (double) n_draft_accepted / (double) n_draft_verif_steps : 1.0; + + std::string acceptance_rates_per_pos; + if (n_draft_verif_steps > 0) { + for (size_t i = 0; i < n_accepted_per_pos.size(); ++i) { + if (i > 0) { + acceptance_rates_per_pos += ", "; + } + acceptance_rates_per_pos += string_format("%.3f", (double) n_accepted_per_pos[i] / (double) n_draft_verif_steps); + } + } + SLT_INF(*this, - "draft acceptance = %0.5f (%5d accepted / %5d generated)\n", - draft_ratio, n_draft_accepted, n_draft_total); + "draft acceptance = %0.5f (%5d accepted / %5d generated), mean acceptance length = %5.2f, acceptance rate per position = (%s)\n", + draft_ratio, n_draft_accepted, n_draft_total, mean_acc_len, acceptance_rates_per_pos.c_str()); } common_speculative_print_stats(spec); @@ -578,6 +693,94 @@ struct server_slot { other.prompt = prompt.clone(); other.init_sampler(); } + + // returns 0 on success + // caller need to update prompt.tokens after a successful call to keep track of the processing progress + int process_mtmd_chunk(size_t idx, size_t & n_tokens_out) { + GGML_ASSERT(mctx); + const auto & input_tokens = task->tokens; + const auto & chunk = input_tokens.find_chunk(idx); + int32_t res = 0; + + auto try_decode = [&]() -> int32_t { + if (mbatch) { + float * embd = mtmd_batch_get_output_embd(mbatch.get(), chunk.get()); + if (embd) { + void * cb_data = spec; + static auto cb = [](llama_batch batch, void * user_data) { + common_speculative * spec = static_cast(user_data); + if (!common_speculative_process(spec, batch)) { + return 1; + } + return 0; + }; + + llama_pos new_n_past; // unused for now + res = mtmd_helper_decode_image_chunk( + mctx, + ctx_tgt, + chunk.get(), + embd, + prompt.tokens.pos_next(), + id, + llama_n_batch(ctx_tgt), + &new_n_past, + cb, + cb_data + ); + if (res != 0) { + SLT_ERR(*this, "failed to decode mtmd chunk, idx = %zu, res = %d\n", idx, res); + return -1; + } + n_tokens_out = mtmd_input_chunk_get_n_tokens(chunk.get()); + return 0; // success + } + } + return 1; // (non-error) need to create & encode batch + }; + + // if the batch is already exist, try searching & encode + res = try_decode(); + if (res == 0) { + return 0; + } + if (res < 0) { + // fatal error + return res; + } + + // otherwise, the batch is either uninitialized or is used up + // we need to create & encode a new batch + mbatch.reset(mtmd_batch_init(mctx)); + res = mtmd_batch_add_chunk(mbatch.get(), chunk.get()); + GGML_ASSERT(res == 0); // we should never have an empty batch + + // try batching as much as possible + int n_added = 1; + size_t idx_cur = idx; + while (res == 0) { + auto [next_chunk, next_idx] = input_tokens.find_next_media_chunk(idx_cur); + if (next_chunk == nullptr) { + break; + } + res = mtmd_batch_add_chunk(mbatch.get(), next_chunk->get()); + n_added += (res == 0 ? 1 : 0); + idx_cur = next_idx; + SLT_DBG(*this, "try adding media chunk idx = %zu to batch, res = %d\n", next_idx, res); + // if res != 0, batch is full or chunk is not compatible -> this loop breaks + } + + // TODO @ngxson : move this log line to debug when it become more stable + SLT_INF(*this, "encoding mtmd batch from idx = %zu, n_chunks = %d\n", idx, n_added); + + res = mtmd_batch_encode(mbatch.get()); + if (res != 0) { + SLT_ERR(*this, "failed to encode mtmd batch for chunk idx = %zu, res = %d\n", idx, res); + return -1; + } + + return try_decode(); + } }; @@ -666,6 +869,8 @@ struct server_context_impl { // note: chat_params must not be refreshed upon existing sleeping state server_chat_params chat_params; + server_state_callback_t callback_state = [](server_state, json) -> void {}; + server_context_impl() { mtmd_helper_log_set(common_log_default_callback, nullptr); } @@ -689,7 +894,7 @@ struct server_context_impl { llama_context * ctx_tgt = nullptr; - llama_batch batch {}; + server_batch batch; llama_model_ptr model_dft; llama_context_ptr ctx_dft; @@ -718,8 +923,7 @@ struct server_context_impl { server_metrics metrics; - json json_ui_settings = json::object(); // Primary: new name - json json_webui_settings = json::object(); // Deprecated: use json_ui_settings instead (kept for compat) + json json_ui_settings = json::object(); // Necessary similarity of prompt for slot selection float slot_prompt_similarity = 0.0f; @@ -730,6 +934,8 @@ struct server_context_impl { bool sleeping = false; + int64_t t_last_load_progress_ms = 0; + void destroy() { spec.reset(); ctx_dft.reset(); @@ -742,8 +948,6 @@ struct server_context_impl { mtmd_free(mctx); mctx = nullptr; - - llama_batch_free(batch); } void handle_sleeping_state(bool new_state) { @@ -760,18 +964,77 @@ struct server_context_impl { sleeping = new_state; } + struct load_progress_data { + server_context_impl * ctx; + std::string stage; + std::vector stages; + int64_t t_last_load_progress_ms = 0; + load_progress_data(server_context_impl * ctx, const std::string & stage) : ctx(ctx), stage(stage) {} + }; + static bool load_progress_callback(float progress, void * user_data) { + auto * d = static_cast(user_data); + GGML_ASSERT(d); + // always emit the first and final sample; throttle the rest to one per 200ms + { + auto & t_last = d->t_last_load_progress_ms; + const int64_t t_now = ggml_time_ms(); + const bool first = t_last == 0; + const bool done = progress >= 1.0f; + const bool throttled = !first && !done && (t_now - t_last) < 200; + if (throttled) { + return true; + } + t_last = t_now; + } + if (d->ctx->callback_state) { + d->ctx->callback_state(SERVER_STATE_LOADING, { + {"stages", d->stages}, + {"current", d->stage}, + {"value", progress}, + }); + } + return true; + } + // load the model and initialize llama_context // this may also be called to resume from sleeping state bool load_model(common_params & params) { - bool is_resume = sleeping; + load_progress_data load_progress_text (this, "text_model"); + load_progress_data load_progress_mmproj(this, "mmproj_model"); + load_progress_data load_progress_spec (this, "spec_model"); - SRV_INF("loading model '%s'\n", params.model.path.c_str()); + const bool is_resume = sleeping; params_base = params; params_base.n_outputs_max = server_n_outputs_max(params_base); + const bool has_mmproj = !params.mmproj.path.empty(); + const bool has_draft = params.speculative.has_dft(); + const bool spec_mtp = std::find(params_base.speculative.types.begin(), + params_base.speculative.types.end(), + COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); + const bool has_spec = has_draft || spec_mtp; + + if (callback_state) { + std::vector stages = {"text_model"}; + if (has_spec) { + stages.push_back("spec_model"); + } + if (has_mmproj) { + stages.push_back("mmproj_model"); + } + load_progress_text.stages = stages; + load_progress_mmproj.stages = stages; + load_progress_spec.stages = stages; + + // trigger 0% progress + load_progress_callback(0.0f, &load_progress_text); + } + + + SRV_INF("loading model '%s'\n", params.model.path.c_str()); + std::string & mmproj_path = params_base.mmproj.path; - bool has_mmproj = !mmproj_path.empty(); mtmd_context_params mparams = mtmd_context_params_default(); if (has_mmproj) { mparams.use_gpu = params_base.mmproj_use_gpu; @@ -781,18 +1044,24 @@ struct server_context_impl { mparams.warmup = params_base.warmup; mparams.image_min_tokens = params_base.image_min_tokens; mparams.image_max_tokens = params_base.image_max_tokens; + mparams.batch_max_tokens = params_base.mtmd_batch_max_tokens; mparams.media_marker = get_media_marker(); + // progress callback + mparams.progress_callback = load_progress_callback; + mparams.progress_callback_user_data = &load_progress_mmproj; } // optionally get the memory usage of mmproj if (has_mmproj && params_base.fit_params) { + int64_t t_start = ggml_time_us(); auto mmproj_mem = mtmd_get_memory_usage(mmproj_path.c_str(), mparams); + int64_t t_elapsed = ggml_time_us() - t_start; if (!mmproj_mem.empty()) { size_t total = 0; for (auto & [dev, size] : mmproj_mem) { total += size; } - SRV_INF("[mtmd] estimated worst-case memory usage of mmproj is %.2f MiB\n", total / (1024.0 * 1024.0)); + SRV_INF("[mtmd] estimated worst-case memory usage of mmproj is %.2f MiB (took %.2f ms)\n", total / (1024.0 * 1024.0), t_elapsed / 1000.0); GGML_ASSERT(!params_base.fit_params_target.empty()); for (auto & [dev, size] : mmproj_mem) { for (size_t i = 0; i < ggml_backend_dev_count(); i++) { @@ -812,12 +1081,7 @@ struct server_context_impl { // optionally reserve VRAM for the draft / MTP context before fitting the target model if (params_base.fit_params) { - const bool spec_mtp = std::find(params_base.speculative.types.begin(), - params_base.speculative.types.end(), - COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); - const bool has_draft = params_base.speculative.has_dft(); - - if (has_draft || spec_mtp) { + if (has_spec) { common_params params_dft = params_base; bool measure_model_bytes = true; @@ -866,10 +1130,7 @@ struct server_context_impl { } for (size_t j = 0; j < devs.size(); ++j) { - const size_t bytes = - (measure_model_bytes ? dmd[j].mb.model : 0) + - dmd[j].mb.context + - dmd[j].mb.compute; + const size_t bytes = (measure_model_bytes ? dmd[j].model : 0) + dmd[j].context + dmd[j].compute; total += bytes; for (size_t i = 0; i < tgt_devices.size(); i++) { if (tgt_devices[i] == devs[j]) { @@ -890,6 +1151,12 @@ struct server_context_impl { } } + // attach a progress callback + { + params_base.load_progress_callback = load_progress_callback; + params_base.load_progress_callback_user_data = &load_progress_text; + } + llama_init = common_init_from_params(params_base); model_tgt = llama_init->model(); @@ -906,7 +1173,7 @@ struct server_context_impl { add_bos_token = llama_vocab_get_add_bos(vocab); - if (params_base.speculative.has_dft()) { + if (has_draft) { // TODO speculative: move to common/speculative.cpp? const auto & params_spec = params_base.speculative.draft; @@ -929,6 +1196,10 @@ struct server_context_impl { auto mparams_dft = common_model_params_to_llama(params_dft); + // progress callback + mparams_dft.progress_callback = load_progress_callback; + mparams_dft.progress_callback_user_data = &load_progress_spec; + model_dft.reset(llama_model_load_from_file(params_dft.model.path.c_str(), mparams_dft)); if (model_dft == nullptr) { SRV_ERR("failed to load draft model, '%s'\n", params_dft.model.path.c_str()); @@ -937,10 +1208,6 @@ struct server_context_impl { auto cparams = common_context_params_to_llama(params_dft); - const bool spec_mtp = std::find(params_base.speculative.types.begin(), - params_base.speculative.types.end(), - COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end(); - if (spec_mtp) { cparams.ctx_type = LLAMA_CONTEXT_TYPE_MTP; } @@ -951,11 +1218,17 @@ struct server_context_impl { cparams.ctx_other = ctx_tgt; ctx_dft.reset(llama_init_from_model(model_dft.get(), cparams)); + if (ctx_dft == nullptr) { + SRV_ERR("%s", "failed to create draft context\n"); + return false; + } params_base.speculative.draft.ctx_tgt = ctx_tgt; params_base.speculative.draft.ctx_dft = ctx_dft.get(); - } else if (std::find(params_base.speculative.types.begin(), params_base.speculative.types.end(), - COMMON_SPECULATIVE_TYPE_DRAFT_MTP) != params_base.speculative.types.end()) { + } else if (spec_mtp) { + // no new model load, so we simply report 0.0 and 1.0 progress + load_progress_callback(0.0f, &load_progress_spec); + SRV_INF("creating MTP draft context against the target model '%s'\n", params_base.model.path.c_str()); @@ -975,9 +1248,15 @@ struct server_context_impl { params_base.speculative.draft.ctx_tgt = ctx_tgt; params_base.speculative.draft.ctx_dft = ctx_dft.get(); + + load_progress_callback(1.0f, &load_progress_spec); } if (has_mmproj) { + if (callback_state) { + callback_state(SERVER_STATE_LOADING, {{"stage", "mmproj_model"}}); + } + if (!is_resume) { mtmd_helper_log_set(common_log_default_callback, nullptr); } @@ -1113,7 +1392,7 @@ struct server_context_impl { // note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used) { const int32_t n_batch = llama_n_batch(ctx_tgt); - batch = llama_batch_init(std::max(n_batch, params_base.n_parallel), 0, 1); + batch.init(std::max(n_batch, params_base.n_parallel)); } if (params_base.cache_ram_mib != 0) { @@ -1140,8 +1419,8 @@ struct server_context_impl { if (!params_base.model_alias.empty()) { // backward compat: use first alias as model name model_name = *params_base.model_alias.begin(); - } else if (!params_base.model.name.empty()) { - model_name = params_base.model.name; + } else if (!params_base.model.get_name().empty()) { + model_name = params_base.model.get_name(); } else { // fallback: derive model name from file name auto model_path = std::filesystem::path(params_base.model.path); @@ -1158,6 +1437,10 @@ struct server_context_impl { return init(); } + if (callback_state) { + callback_state(SERVER_STATE_READY, {}); + } + return true; } @@ -1197,16 +1480,12 @@ struct server_context_impl { } } - // populate UI settings (from either new ui_config_json or deprecated webui_config_json) { - const std::string & cfg = !params_base.ui_config_json.empty() - ? params_base.ui_config_json - : params_base.webui_config_json; + const std::string & cfg = params_base.ui_config_json; if (!cfg.empty()) { try { json json_settings = json::parse(cfg); json_ui_settings = json_settings; - json_webui_settings = json_settings; // deprecated: keep in sync } catch (const std::exception & e) { SRV_ERR("%s: failed to parse UI config: %s\n", __func__, e.what()); return false; @@ -1238,6 +1517,9 @@ struct server_context_impl { const bool enable_thinking = params_base.enable_reasoning != 0 && template_supports_thinking; SRV_INF("%s: chat template, thinking = %d\n", __func__, enable_thinking); + // IMPORTANT: chat_params is reused across sleeping / resuming states, + // never store llama_context/llama_model pointers in chat_params, + // as they may be invalidated after sleeping chat_params = { /* use_jinja */ params_base.use_jinja, /* prefill_assistant */ params_base.prefill_assistant, @@ -1290,11 +1572,23 @@ struct server_context_impl { bool update_cache = false; + // if a specific slot is requested, use it (still goes through cache update logic below) + if (task.id_slot != -1) { + ret = get_slot_by_id(task.id_slot); + if (ret) { + SLT_INF(*ret, "selected slot by id (%d)\n", task.id_slot); + } + } + // find the slot that has at least n% prompt similarity - if (ret == nullptr && slot_prompt_similarity != 0.0f) { + if (slot_prompt_similarity != 0.0f) { float sim_best = 0; for (server_slot & slot : slots) { + if (task.id_slot != -1 && slot.id != task.id_slot) { + continue; + } + // skip the slot if it is not available if (slot.is_processing()) { continue; @@ -1321,8 +1615,10 @@ struct server_context_impl { if (ret != nullptr) { const float f_keep = (sim_best*task.tokens.size()) / ret->prompt.tokens.size(); - SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n", - sim_best, slot_prompt_similarity, f_keep); + if (task.id_slot == -1) { + SLT_INF(*ret, "selected slot by LCP similarity, sim_best = %.3f (> %.3f thold), f_keep = %.3f\n", + sim_best, slot_prompt_similarity, f_keep); + } // if we are about to lose a large portion of the existing context - save it in the prompt cache if (f_keep < 0.5f) { @@ -1710,8 +2006,7 @@ struct server_context_impl { }); } } else { - // TODO: optimize this with min-p optimization - std::vector cur = get_token_probabilities(ctx_tgt, idx); + std::vector cur = get_token_probabilities(ctx_tgt, idx, n_probs_request); const size_t max_probs = cur.size(); const size_t n_probs = std::min(max_probs, n_probs_request); @@ -2046,10 +2341,15 @@ struct server_context_impl { auto & cur = slot.prompt.checkpoints.emplace_back(); + // [TAG_CHECKPOINTS_FIX_POS_MIN] + // TODO: here we incorrectly deterimne that the saved checkpoint data covers the [pos_min, pos_max] range + // this is not true for SWA models: https://github.com/ggml-org/llama.cpp/pull/24411#issuecomment-4677983225 cur.update_pos(slot.prompt.n_tokens() - n_tokens_cur, pos_min, pos_max); cur.update_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); cur.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + // stash the draft's speculative state with the checkpoint + common_speculative_get_state(spec.get(), slot.id, cur.data_spec); SLT_INF(slot, "created context checkpoint %d of %d (pos_min = %d, pos_max = %d, n_tokens = %" PRId64 ", size = %.3f MiB)\n", @@ -2072,10 +2372,9 @@ struct server_context_impl { } } - const int id_slot = task.id_slot; const int id_task = task.id; - server_slot * slot = id_slot != -1 ? get_slot_by_id(id_slot) : get_available_slot(task); + server_slot * slot = get_available_slot(task); // // slot scheduling logic @@ -2383,7 +2682,83 @@ struct server_context_impl { } } + void iterate(std::vector & slots, std::function callback) { + for (auto & slot : slots) { + try { + callback(slot); + } catch (const std::exception & e) { + SLT_ERR(slot, "got exception: %s\n", e.what()); + send_error(slot, std::string("got exception: ") + e.what(), ERROR_TYPE_SERVER); + slot.release(); + } + } + } + + void iterate(std::vector & slots, std::function callback) { + for (auto & slot : slots) { + try { + callback(*slot); + } catch (const std::exception & e) { + SLT_ERR(*slot, "got exception: %s\n", e.what()); + send_error(*slot, std::string("got exception: ") + e.what(), ERROR_TYPE_SERVER); + slot->release(); + } + } + } + + void abort_all_slots(const std::string & reason) { + for (auto & slot : slots) { + if (slot.is_processing()) { + send_error(slot, reason, ERROR_TYPE_SERVER); + slot.release(); + } + } + } + + // @ngxson : for debugging only + int64_t t_pre_decode = 0; + int64_t t_decode = 0; + int64_t t_post_decode = 0; + int64_t t_sampl = 0; + int64_t n_pre_decode = 0; + int64_t n_decode = 0; + int64_t n_post_decode = 0; + int64_t n_sampl = 0; +// #define DEBUG_TIMINGS +#ifdef DEBUG_TIMINGS + struct scoped_timer { + int64_t & t; + int64_t & n; + int64_t t_start; + scoped_timer(int64_t & t_, int64_t & n_) : t(t_), n(n_) { + t_start = ggml_time_us(); + } + ~scoped_timer() { + t += ggml_time_us() - t_start; + n++; + } + }; +#else + struct scoped_timer { + scoped_timer(int64_t &, int64_t &) {} + ~scoped_timer() {} + }; +#endif + void update_slots() { +#ifdef DEBUG_TIMINGS + static int64_t t_prev = 0; + int64_t t_start = ggml_time_us(); + if (t_start - t_prev > 5 * 1000 * 1000) { // every 5 seconds + t_prev = t_start; + SRV_INF("n_pre_decode = %" PRId64 "\n", n_pre_decode); + SRV_INF("avg t_pre_decode = %f ms\n", (double) t_pre_decode / n_pre_decode / 1000.0); + SRV_INF("avg t_decode = %f ms\n", (double) t_decode / n_decode / 1000.0); + SRV_INF("avg t_post_decode = %f ms\n", (double) t_post_decode / n_post_decode / 1000.0); + SRV_INF("avg t_sampl = %f ms\n", (double) t_sampl / n_sampl / 1000.0); + } +#endif + // check if all slots are idle { bool all_idle = true; @@ -2397,29 +2772,80 @@ struct server_context_impl { if (all_idle) { SRV_INF("%s", "all slots are idle\n"); + return; // skip further processing - return; + } else { + SRV_DBG("%s", "posting NEXT_RESPONSE\n"); + + server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE); + task.id = queue_tasks.get_new_id(); + queue_tasks.post(std::move(task)); } } - { - SRV_DBG("%s", "posting NEXT_RESPONSE\n"); + try { + scoped_timer t(t_pre_decode, n_pre_decode); + pre_decode(); + batch.render(); + } catch (const std::exception & e) { + SRV_ERR("pre_decode() failed: %s\n", e.what()); + abort_all_slots("pre_decode() failed: " + std::string(e.what())); + } + + llama_batch batch_view; + int32_t off_next = 0; + int32_t n_batch = llama_n_batch(ctx_tgt); + for (int32_t off = 0; off < batch.size(); off = off_next) { + const int32_t n_tokens = std::min(n_batch, batch.size() - off); + try { + scoped_timer t(t_decode, n_decode); + // TODO @ngxson : maybe handle n_batch == 1 here instead of inside decode() + + batch_view = batch.get_view(off, n_tokens); + bool ok = decode(n_batch, off, batch_view); +#ifdef DEBUG_TIMINGS + llama_synchronize(ctx_tgt); +#endif + + if (ok) { + // move the head of the batch forward with the number of tokens we just processed + off_next = off + n_tokens; + + // on successful decode, restore the original batch size + n_batch = llama_n_batch(ctx_tgt); + } else { + // try again with the updated n_batch + continue; + } + } catch (const std::exception & e) { + SRV_ERR("decode() failed: %s\n", e.what()); + abort_all_slots("decode() failed: " + std::string(e.what())); + break; // stop any further processing + } + + try { + scoped_timer t(t_post_decode, n_post_decode); + post_decode(n_tokens, off, batch_view); + } catch (const std::exception & e) { + SRV_ERR("post_decode() failed: %s\n", e.what()); + abort_all_slots("post_decode() failed: " + std::string(e.what())); + break; // stop any further processing + } - server_task task(SERVER_TASK_TYPE_NEXT_RESPONSE); - task.id = queue_tasks.get_new_id(); - queue_tasks.post(std::move(task)); } + } + void pre_decode() { // apply context-shift if needed // TODO: simplify and improve - for (server_slot & slot : slots) { + iterate(slots, [&](server_slot & slot) { if (slot.state == SLOT_STATE_GENERATING && slot.prompt.n_tokens() + 1 >= slot.n_ctx) { if (!params_base.ctx_shift) { // this check is redundant (for good) // we should never get here, because generation should already stopped in process_token() send_error(slot, "context shift is disabled", ERROR_TYPE_SERVER); slot.release(); - continue; + return; } if (mctx) { @@ -2431,7 +2857,7 @@ struct server_context_impl { if (slot.task->is_parent() || slot.task->is_child()) { send_error(slot, "context shift cannot be used for shared prompt", ERROR_TYPE_SERVER); slot.release(); - continue; + return; } // Shift context @@ -2444,7 +2870,10 @@ struct server_context_impl { n_keep = std::min(slot.n_ctx - 4, n_keep); const int n_left = slot.prompt.n_tokens() - n_keep; - const int n_discard = slot.task->params.n_discard ? slot.task->params.n_discard : (n_left / 2); + int n_discard = slot.task->params.n_discard ? slot.task->params.n_discard : (n_left / 2); + + // ref: https://github.com/ggml-org/llama.cpp/pull/24786 + n_discard = std::clamp(n_discard, 0, std::max(0, n_left - 1)); SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard); @@ -2474,28 +2903,28 @@ struct server_context_impl { slot.truncated = true; } - } + }); // start populating the batch for this iteration - common_batch_clear(batch); + batch.clear(); // track if given slot can be batched with slots already in the batch - server_slot * slot_batched = nullptr; + auto & slot_batched = batch.slot_batched; std::vector generating; std::vector drafting; // determine which slots are generating and drafting - for (auto & slot : slots) { + iterate(slots, [&](server_slot & slot) { if (slot.state != SLOT_STATE_GENERATING) { - continue; + return; } // check if we can batch this slot with the previous one if (!slot_batched) { slot_batched = &slot; } else if (!slot_batched->can_batch_with(slot)) { - continue; + return; } generating.push_back(&slot); @@ -2543,7 +2972,7 @@ struct server_context_impl { } } } - } + }); // generate the actual drafts (if any) { @@ -2551,9 +2980,7 @@ struct server_context_impl { } // make checkpoints if needed - for (auto * slot_ptr : drafting) { - auto & slot = *slot_ptr; - + iterate(drafting, [&](server_slot & slot) { auto & draft = slot.spec_draft; auto & ckpt = slot.spec_ckpt; @@ -2596,38 +3023,42 @@ struct server_context_impl { ckpt.update_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); } } - } + }); // update the batch with the sampled/drafted tokens - for (auto * slot_ptr : generating) { - auto & slot = *slot_ptr; - - slot.update_batch(batch); - } + iterate(generating, [&](server_slot & slot) { + slot.handle_last_sampled_token(batch); + }); // process in chunks of params.n_batch int32_t n_batch = llama_n_batch(ctx_tgt); int32_t n_ubatch = llama_n_ubatch(ctx_tgt); - float alora_scale = -1.0f; - size_t alora_disabled_id = 0; + auto & alora_scale = batch.alora_scale; + auto & alora_disabled_id = batch.alora_disabled_id; // next, batch any pending prompts without exceeding n_batch - if (params_base.cont_batching || batch.n_tokens == 0) { - for (auto & slot : slots) { + if (params_base.cont_batching || batch.size() == 0) { + bool add_ok = true; // false means the batch is full, skip remaining slots + + iterate(slots, [&](server_slot & slot) { + if (!add_ok || batch.size() >= n_batch) { + return; // batch is full, skip remaining slots + } + if (!slot.is_processing()) { - continue; + return; } // check if we can batch this slot with the previous one if (slot_batched && !slot_batched->can_batch_with(slot)) { - continue; + return; } // check if this is a child slot if (slot.state == SLOT_STATE_WAIT_OTHER) { SLT_DBG(slot, "%s", "waiting for parent slot to complete\n"); - continue; + return; } // this slot still has a prompt to be processed @@ -2635,7 +3066,7 @@ struct server_context_impl { const auto & input_tokens = slot.task->tokens; // used to determine the number of tokens added to the batch for the current slot - const auto n_tokens_prev = batch.n_tokens; + const auto n_tokens_prev = batch.size(); // TODO: maybe move branch to outside of this loop in the future if (slot.state == SLOT_STATE_STARTED) { @@ -2671,14 +3102,14 @@ struct server_context_impl { send_final_response(slot); slot.release(); - continue; + return; } // TODO: support memory-less logits computation if (slot.task->need_logits() && !llama_get_memory(ctx_tgt)) { send_error(slot, "the current context does not logits computation. skipping", ERROR_TYPE_SERVER); slot.release(); - continue; + return; } if (!slot.can_split()) { @@ -2690,7 +3121,7 @@ struct server_context_impl { slot.task->n_tokens(), n_ubatch), ERROR_TYPE_SERVER); slot.release(); - continue; + return; } if (slot.task->n_tokens() > slot.n_ctx) { @@ -2701,7 +3132,7 @@ struct server_context_impl { slot.task->n_tokens(), slot.n_ctx), ERROR_TYPE_EXCEED_CONTEXT_SIZE); slot.release(); - continue; + return; } } else { if (slot.task->n_tokens() >= slot.n_ctx) { @@ -2711,7 +3142,7 @@ struct server_context_impl { slot.task->n_tokens(), slot.n_ctx), ERROR_TYPE_EXCEED_CONTEXT_SIZE); slot.release(); - continue; + return; } if (slot.task->params.cache_prompt) { @@ -2860,6 +3291,10 @@ struct server_context_impl { // guarantee that a checkpoint will result in at least one token being processed [TAG_PROMPT_LOGITS] LOG_INF("slot %12.*s: id %2d | task %d | Checking checkpoint with [%d, %d] against %d...\n", 12, func_name, (slot).id, ((slot).task ? (slot).task->id : -1), cur.pos_min, cur.pos_max, pos_min_thold); + // workaround for [TAG_CHECKPOINTS_FIX_POS_MIN] + if (cur.pos_max > pos_next) { + return false; + } return cur.pos_min < pos_min_thold || cur.pos_min == 0; } ); @@ -2870,6 +3305,8 @@ struct server_context_impl { // restore the context checkpoint it->load_tgt(ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); it->load_dft(ctx_dft.get(), slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + // restore the draft's speculative state + common_speculative_set_state(spec.get(), slot.id, it->data_spec); pos_next = std::min(pos_next, std::max(it->pos_min + 1, it->pos_max)); n_past = std::min(slot.prompt.tokens.size_up_to_pos(pos_next), (size_t) it->n_tokens); @@ -2921,17 +3358,17 @@ struct server_context_impl { send_partial_response(slot, {}, false, true); } } - } + } // end of SLOT_STATE_STARTED if (!slot.can_split()) { // cannot fit the prompt in the current batch - will try next iter - if (batch.n_tokens + slot.task->n_tokens() > n_batch) { - continue; + if (batch.size() + slot.task->n_tokens() > n_batch) { + return; } } - const int64_t t_current = ggml_time_us(); - slot.t_prompt_processing = (t_current - slot.t_start_process_prompt) / 1e3; + const int64_t t_now = ggml_time_us(); + slot.t_prompt_processing = (t_now - slot.t_start_process_prompt) / 1e3; slot.print_timings_pp(); // truncate any tokens that are beyond n_past for this slot @@ -2976,10 +3413,18 @@ struct server_context_impl { bool has_mtmd = false; // check if we should process the image - while (slot.prompt.n_tokens() < slot.task->n_tokens() && input_tokens[slot.prompt.n_tokens()] == LLAMA_TOKEN_NULL) { + while (true) { + auto cur_token_idx = slot.prompt.n_tokens(); + if ( + cur_token_idx >= slot.task->n_tokens() || + input_tokens[cur_token_idx] != LLAMA_TOKEN_NULL // encountered a text token + ) { + break; + } + // process the image size_t n_tokens_out = 0; - int32_t res = input_tokens.process_chunk(ctx_tgt, mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); + int32_t res = slot.process_mtmd_chunk(cur_token_idx, n_tokens_out); if (res != 0) { SLT_ERR(slot, "failed to process image, res = %d\n", res); send_error(slot, "failed to process image", ERROR_TYPE_SERVER); @@ -2987,33 +3432,22 @@ struct server_context_impl { continue; } - if (ctx_dft && llama_get_ctx_other(ctx_dft.get()) != ctx_tgt) { - // TODO: in the future, figure out how to infuse target embeddings to the images - // for now, we skip this for simplicity - // maybe we simply need to call `common_speculative_process()` on the mtmd batches in the `process_chunk` above? - // [TAG_MTMD_DRAFT_PROCESSING] - res = input_tokens.process_chunk(ctx_dft.get(), mctx, slot.prompt.n_tokens(), slot.prompt.tokens.pos_next(), slot.id, n_tokens_out); - if (res != 0) { - GGML_ABORT("failed to process multi-modal data on draft context\n"); - } - } - slot.n_prompt_tokens_processed += n_tokens_out; // add the image chunk to cache { - const auto & chunk = input_tokens.find_chunk(slot.prompt.n_tokens()); + const auto & chunk = input_tokens.find_chunk(cur_token_idx); slot.prompt.tokens.push_back(chunk.get()); // copy } has_mtmd = true; } - const int32_t n_before_user = slot.task->params.n_before_user; - const bool n_before_user_known = n_before_user > 0; + const auto & spans = slot.task->params.message_spans; + const auto last_user_pos = spans.last_user_message_pos(); // add prompt tokens for processing in the current batch - while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.n_tokens < n_batch) { + while (slot.prompt.n_tokens() < slot.task->n_tokens() && batch.size() < n_batch) { // get next token to process llama_token cur_tok = input_tokens[slot.prompt.n_tokens()]; if (cur_tok == LLAMA_TOKEN_NULL) { @@ -3031,19 +3465,16 @@ struct server_context_impl { // embedding requires all tokens in the batch to be output; // MTP also wants logits at every prompt position so the // streaming hook can mirror t_h_nextn into ctx_dft. - common_batch_add(batch, + add_ok &= batch.add(slot.id, cur_tok, slot.prompt.tokens.pos_next(), - { slot.id }, slot.need_embd()); slot.prompt.tokens.push_back(cur_tok); slot.n_prompt_tokens_processed++; - // stop the prompt batch exactly before the latest user input, so a checkpoint - // can be created after the previous messages - if (n_before_user_known && - slot.prompt.n_tokens() == n_before_user) { + // stop the prompt batch exactly before a user message + if (spans.is_user_start(slot.prompt.n_tokens())) { break; } @@ -3070,26 +3501,32 @@ struct server_context_impl { } // the number of tokens added to the batch for the current slot - const auto n_tokens_cur = batch.n_tokens - n_tokens_prev; + const auto n_tokens_cur = batch.size() - n_tokens_prev; + + const auto n_tokens_start = slot.prompt.n_tokens() - n_tokens_cur; const bool near_prompt_end = slot.task->n_tokens() < slot.prompt.n_tokens() + n_ubatch; + const bool is_user_start = spans.is_user_start(n_tokens_start); + const bool is_last_user_message = n_tokens_start == last_user_pos; + // entire prompt has been processed if (slot.prompt.n_tokens() == slot.task->n_tokens()) { slot.state = SLOT_STATE_DONE_PROMPT; - GGML_ASSERT(batch.n_tokens > 0); + GGML_ASSERT(batch.size() > 0); // extract the logits only for the last token - batch.logits[batch.n_tokens - 1] = true; + batch.set_output(batch.size() - 1, true); slot.n_decoded = 0; - slot.i_batch = batch.n_tokens - 1; + slot.i_batch = batch.size() - 1; slot.init_sampler(); } else { - // skip ordinary mid-prompt checkpoints - if (!n_before_user_known && !near_prompt_end) { + // skip ordinary mid-prompt checkpoints, unless the batch starts a user + // message or we are near the end of the prompt + if (!is_user_start && !near_prompt_end) { do_checkpoint = false; } } @@ -3097,29 +3534,6 @@ struct server_context_impl { const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id); const auto pos_max = llama_memory_seq_pos_max(llama_get_memory(ctx_tgt), slot.id); - // checkpoints are created before the current batch is decoded, so - // their token position is the batch start rather than the prompt end - const int32_t n_tokens_start = slot.prompt.n_tokens() - n_tokens_cur; - - { - const bool is_on_user = - n_before_user_known && - n_tokens_start == n_before_user; - - const bool is_after_user = - n_before_user_known && - n_tokens_start > n_before_user; - - const bool is_allowed = - !n_before_user_known || - is_on_user || - (is_after_user && near_prompt_end); - - if (do_checkpoint && !is_allowed) { - do_checkpoint = false; - } - } - // nothing to checkpoint yet // TODO: is this check needed? if (do_checkpoint && pos_min < 0) { @@ -3129,8 +3543,8 @@ struct server_context_impl { // do not checkpoint after mtmd chunks do_checkpoint = do_checkpoint && !has_mtmd; - // no need to create checkpoints that are too close together - do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || n_tokens_start > slot.prompt.checkpoints.back().n_tokens + params_base.checkpoint_min_step); + // no need to create checkpoints that are too close together, unless it's the last user message + do_checkpoint = do_checkpoint && (slot.prompt.checkpoints.empty() || is_last_user_message || n_tokens_start > slot.prompt.checkpoints.back().n_tokens + params_base.checkpoint_min_step); SLT_DBG(slot, "main/do_checkpoint = %s, pos_min = %d, pos_max = %d\n", do_checkpoint ? "yes" : "no", pos_min, pos_max); // note: we create the checkpoint before calling llama_decode(), so the current batch is not @@ -3143,20 +3557,20 @@ struct server_context_impl { if (!slot_batched) { slot_batched = &slot; } - - if (batch.n_tokens >= n_batch) { - break; - } - } + }); } + } - SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens); + // returns true = success ; false = retry with smaller batch size + // throw std::runtime_error on fatal error + bool decode(int32_t & n_batch, int32_t off, llama_batch & batch_view) { + SRV_DBG("n_batch (effective) = %d, off = %d\n", n_batch, off); - auto accept_special_token = [&](server_slot & slot, llama_token token) { - return params_base.special || - slot.task->params.sampling.preserved_tokens.find(token) != slot.task->params.sampling.preserved_tokens.end(); - }; + auto & slot_batched = batch.slot_batched; + auto & alora_scale = batch.alora_scale; + auto & alora_disabled_id = batch.alora_disabled_id; + // TODO @ngxson : alora handling is too messy, need to refactor it to be more clear and maintainable if (slot_batched) { // apply lora, only need to do it once per batch common_set_adapter_lora(ctx_tgt, slot_batched->lora); @@ -3171,372 +3585,348 @@ struct server_context_impl { llama_set_embeddings(ctx_tgt, slot_batched->need_embd()); } - if (batch.n_tokens == 0) { + if (batch.size() == 0) { SRV_WRN("%s", "no tokens to decode\n"); if (++n_empty_consecutive > 3) { GGML_ABORT("fatal error - please provide logs and repro in %s\n", "https://github.com/ggml-org/llama.cpp/pull/20277"); } + + return true; // nothing to decode } else { n_empty_consecutive = 0; } - int32_t i_next = 0; + const int ret = llama_decode(ctx_tgt, batch_view); - // process the created batch of tokens - for (int32_t i = 0; i < batch.n_tokens; i = i_next) { - const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i); + metrics.on_decoded(slots); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - }; - - const int ret = llama_decode(ctx_tgt, batch_view); - - metrics.on_decoded(slots); - - if (ret != 0) { - { - std::string err; + if (ret != 0) { + { + std::string err; - if (n_batch == 1 && ret == 1) { - // TODO: try to terminate only the largest active slot/sequence and continue with the rest - // need to remove the tokens from the current batch too - err = "Context size has been exceeded."; - } + if (n_batch == 1 && ret == 1) { + // TODO: try to terminate only the largest active slot/sequence and continue with the rest + // need to remove the tokens from the current batch too + err = "Context size has been exceeded."; + } - if (ret == -1) { - err = "Invalid input batch."; - } + if (ret == -1) { + err = "Invalid input batch."; + } - if (ret < -1) { - // TODO: update slot state based on llama_memory_seq_pos_min() and llama_memory_seq_pos_max() - err = "Compute error."; - } + if (ret < -1) { + // TODO: update slot state based on llama_memory_seq_pos_min() and llama_memory_seq_pos_max() + err = "Compute error."; + } - // TODO: handle ret == 2 (abort) when we start aborting + // TODO: handle ret == 2 (abort) when we start aborting - if (!err.empty()) { - SRV_ERR("%s i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret); + if (!err.empty()) { + SRV_ERR("%s off = %d, n_batch = %d, ret = %d\n", err.c_str(), off, n_batch, ret); - for (auto & slot : slots) { - if (slot.is_processing()) { - send_error(slot, err); - slot.release(); + for (auto & slot : slots) { + if (slot.is_processing()) { + send_error(slot, err); + slot.release(); - // note: it's complicated to keep track of how much of the current batch has been - // processed before the error occurred, so we simply clear the entire context - slot.prompt_clear(false); - } + // note: it's complicated to keep track of how much of the current batch has been + // processed before the error occurred, so we simply clear the entire context + slot.prompt_clear(false); } - - break; } - } - // retry with half the batch size to try to find a free slot in the KV cache - if (!try_clear_idle_slots()) { - n_batch /= 2; + // stop, do not retry with smaller batch size + throw std::runtime_error(err); } - - SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret); - - continue; // continue loop of n_batch } - // TODO: avoid restoring the draft context and re-evaluating the drafted tokens when not needed [TAG_SPEC_AVOID_DRAFT_REEVAL] - // for now, always re-evaluate for simplicity - // ref: https://github.com/ggml-org/llama.cpp/pull/22728#issuecomment-4400925384 - // - // | spec type | need re-eval | - // | --- | --- | - // | draft model | no | because the draft model does not use embeddings from the target - // | MTP (std) | yes | - // | MTP Gemma4 | no | because the KV cache is shared - // | Eagle3 | yes | - // | DFlash | yes | https://github.com/ggml-org/llama.cpp/pull/22728#issuecomment-4405406982 - // - // note: this logic is now moved in `common_speculative_process()` - // keeping the sketch here until for a bit, until the logic is finalized - // - //if (ctx_dft) { - // // TODO: update as needed for MTP, Eagle3, etc. - // const bool need_tgt_embd = false; - - // if (need_tgt_embd) { - // llama_synchronize(ctx_tgt); - // } - - // // the logic here varies depending on the speculative decoding method - // // - some draft contexts require embeddings from the target context, others don't - // // - some draft contexts involve an encoder step to transform the target embeddings to draft embeddings - // // TODO: extract this in a function ? - // { - // // TODO: hook the embeddings from the last target batch here - // if (llama_model_has_encoder(model_dft.get())) { - // //llama_encode(ctx_dft, ...); - - // GGML_ABORT("not implemented yet\n"); - // } - - // const int ret = llama_decode(ctx_dft.get(), batch_view); - - // if (ret != 0) { - // SRV_ERR("failed to decode draft batch, ret = %d\n", ret); - - // // TODO: handle error - // break; - // } - // } - //} - if (!common_speculative_process(spec.get(), batch_view)) { - SRV_ERR("%s", "failed to process speculative batch\n"); - - // TODO: handle error - break; + // retry with half the batch size to try to find a free slot in the KV cache + if (!try_clear_idle_slots()) { + n_batch /= 2; } - // move the head of the batch forward with the number of tokens we just processed - i_next = i + n_tokens; + SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size, off = %d, n_batch = %d, ret = %d\n", off, n_batch, ret); - // on successful decode, restore the original batch size - n_batch = llama_n_batch(ctx_tgt); + return false; // retry with the updated n_batch + } - // handle `n_cmpl > 1` tasks - when the main prompt is processed, activate all child tasks too - for (auto & slot : slots) { - if (slot.state == SLOT_STATE_DONE_PROMPT && slot.task->is_parent()) { - std::vector children; - for (auto & other : slots) { - if (other.state == SLOT_STATE_WAIT_OTHER && slot.task->id == other.task->id_parent) { - children.push_back(&other); - } + // TODO: avoid restoring the draft context and re-evaluating the drafted tokens when not needed [TAG_SPEC_AVOID_DRAFT_REEVAL] + // for now, always re-evaluate for simplicity + // ref: https://github.com/ggml-org/llama.cpp/pull/22728#issuecomment-4400925384 + if (!common_speculative_process(spec.get(), batch_view)) { + SRV_ERR("%s", "failed to process speculative batch\n"); + + // TODO: handle error + throw std::runtime_error("failed to process speculative batch"); + } + + // handle `n_cmpl > 1` tasks - when the main prompt is processed, activate all child tasks too + for (auto & slot : slots) { + if (slot.state == SLOT_STATE_DONE_PROMPT && slot.task->is_parent()) { + std::vector children; + for (auto & other : slots) { + if (other.state == SLOT_STATE_WAIT_OTHER && slot.task->id == other.task->id_parent) { + children.push_back(&other); } + } - // all children slots should already launched by launch_slots_with_parent_task() - // copy state to the child slots - for (auto & child : children) { - SLT_INF(slot, " - copying state to child %d\n", child->id); + // all children slots should already launched by launch_slots_with_parent_task() + // copy state to the child slots + for (auto & child : children) { + SLT_INF(slot, " - copying state to child %d\n", child->id); - GGML_ASSERT(child->state == SLOT_STATE_WAIT_OTHER); + GGML_ASSERT(child->state == SLOT_STATE_WAIT_OTHER); - slot.copy_state_to(*child); - child->state = SLOT_STATE_DONE_PROMPT; - } + slot.copy_state_to(*child); + child->state = SLOT_STATE_DONE_PROMPT; } } + } - for (auto & slot : slots) { - // optionally send prompt processing progress - if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) { - if (slot.task->params.stream && slot.task->params.return_progress) { - send_partial_response(slot, {}, true); - } - } + return true; + } - if (slot.i_batch < (int) i || slot.i_batch >= (int) (i + n_tokens)) { - continue; // continue loop of slots - } + void post_decode(int32_t n_batch_tokens, int32_t off, llama_batch & batch_view) { + // for checking if a given batch index is inside batch_view + auto is_inside_view = [&](int32_t idx) { + return idx >= off && idx < off + n_batch_tokens; + }; - if (slot.state == SLOT_STATE_DONE_PROMPT) { - if (slot.task->type == SERVER_TASK_TYPE_EMBEDDING) { - // prompt evaluated for embedding - send_embedding(slot, batch_view); - slot.release(); - slot.i_batch = -1; - continue; // continue loop of slots - } + // TODO @ngxson : it's tricky to make sub-batch compatible with common_sampler_sample_and_accept_n, + // so for now we will throw an error in this case: https://github.com/ggml-org/llama.cpp/issues/24840 + iterate(slots, [&](server_slot & slot) { + for (auto & i : slot.spec_i_batch) { + if (!is_inside_view(i)) { + throw std::runtime_error(string_format("speculative batch index %d is not inside the current sub-batch [%d, %d)", i, off, off + n_batch_tokens)); + } + } + }); - if (slot.task->type == SERVER_TASK_TYPE_RERANK) { - send_rerank(slot, batch_view); - slot.release(); - slot.i_batch = -1; - continue; // continue loop of slots - } + auto accept_special_token = [&](server_slot & slot, llama_token token) { + return params_base.special || + slot.task->params.sampling.preserved_tokens.find(token) != slot.task->params.sampling.preserved_tokens.end(); + }; - GGML_ASSERT(slot.task->need_sampling()); + iterate(slots, [&](server_slot & slot) { + // optionally send prompt processing progress + if (slot.state == SLOT_STATE_PROCESSING_PROMPT || slot.state == SLOT_STATE_DONE_PROMPT) { + if (slot.task->params.stream && slot.task->params.return_progress) { + send_partial_response(slot, {}, true); + } + } - // prompt evaluated for next-token prediction - slot.state = SLOT_STATE_GENERATING; + if (!is_inside_view(slot.i_batch)) { + // the required token not in this sub-batch, skip + return; + } - if (slot.can_speculate()) { - common_speculative_begin(spec.get(), slot.id, slot.prompt.tokens.get_text_tokens()); - } - } else if (slot.state != SLOT_STATE_GENERATING) { - continue; // continue loop of slots + if (slot.state == SLOT_STATE_DONE_PROMPT) { + if (slot.task->type == SERVER_TASK_TYPE_EMBEDDING) { + // prompt evaluated for embedding + send_embedding(slot, batch_view); + slot.release(); + slot.i_batch = -1; + return; } - if (slot.can_speculate() && !slot.spec_draft.empty()) { - continue; // sample using speculative decoding + if (slot.task->type == SERVER_TASK_TYPE_RERANK) { + send_rerank(slot, batch_view); + slot.release(); + slot.i_batch = -1; + return; } - const int tok_idx = slot.i_batch - i; + GGML_ASSERT(slot.task->need_sampling()); - llama_token id = common_sampler_sample(slot.smpl.get(), slot.ctx_tgt, tok_idx); + // prompt evaluated for next-token prediction + slot.state = SLOT_STATE_GENERATING; - slot.i_batch = -1; + if (slot.can_speculate()) { + common_speculative_begin(spec.get(), slot.id, slot.prompt.tokens.get_text_tokens()); + } + } else if (slot.state != SLOT_STATE_GENERATING) { + return; + } - common_sampler_accept(slot.smpl.get(), id, true); + if (slot.can_speculate() && !slot.spec_draft.empty()) { + return; // sample using speculative decoding + } - // here we have synchronized the llama_context (due to the sampling above), so we can do time measurement - const int64_t t_current = ggml_time_us(); + // shifted according to the current sub-batch + const int tok_idx = slot.i_batch - off; - slot.n_decoded += 1; + llama_token id; + { + scoped_timer timer(t_sampl, n_sampl); + id = common_sampler_sample(slot.smpl.get(), slot.ctx_tgt, tok_idx); + } - if (slot.n_decoded == 1) { - slot.t_start_generation = t_current; - slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; - metrics.on_prompt_eval(slot); - } + slot.i_batch = -1; - slot.t_token_generation = std::max(1, t_current - slot.t_start_generation) / 1e3; + common_sampler_accept(slot.smpl.get(), id, true); - completion_token_output result; - result.tok = id; - result.text_to_send = common_token_to_piece(slot.ctx_tgt, result.tok, accept_special_token(slot, result.tok)); - result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs + // here we have synchronized the llama_context (due to the sampling above), so we can do time measurement + const int64_t t_now = ggml_time_us(); - if (slot.task->params.sampling.n_probs > 0) { - populate_token_probs(slot, result, slot.task->params.post_sampling_probs, params_base.special, tok_idx); - } + slot.n_decoded += 1; - if (!process_token(result, slot)) { - // release slot because of stop condition - slot.print_timings(); - send_final_response(slot); - metrics.on_prediction(slot); - slot.release(); + if (slot.n_decoded == 1) { + slot.t_start_generation = t_now; + slot.t_print_last = t_now; + slot.n_decoded_last = 0; + slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3; + metrics.on_prompt_eval(slot); + } - continue; - } + slot.t_token_generation = std::max(1, t_now - slot.t_start_generation) / 1e3; + + completion_token_output result; + result.tok = id; + result.text_to_send = common_token_to_piece(slot.ctx_tgt, result.tok, accept_special_token(slot, result.tok)); + result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs - slot.print_timings_tg(); + if (slot.task->params.sampling.n_probs > 0) { + populate_token_probs(slot, result, slot.task->params.post_sampling_probs, params_base.special, tok_idx); } - // speculative decoding - main model sample and accept - for (auto & slot : slots) { - if (slot.state != SLOT_STATE_GENERATING || !slot.can_speculate() || slot.spec_draft.empty()) { - continue; - } + if (!process_token(result, slot)) { + // release slot because of stop condition + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + slot.release(); - // save the original draft size - const size_t n_draft = slot.spec_draft.size(); + return; + } - GGML_ASSERT(n_draft > 0); + slot.print_timings_tg(); + }); - // verify and try to accept the draft - { - // save the sampler sampler state in case we need to restore it - common_sampler_ptr smpl_save(common_sampler_clone(slot.smpl.get())); + // speculative decoding - main model sample and accept + iterate(slots, [&](server_slot & slot) { + if (slot.state != SLOT_STATE_GENERATING || !slot.can_speculate() || slot.spec_draft.empty()) { + return; + } - GGML_ASSERT(slot.spec_i_batch.size() == n_draft + 1); - auto accepted = common_sampler_sample_and_accept_n(slot.smpl.get(), slot.ctx_tgt, slot.spec_i_batch, slot.spec_draft); - slot.spec_i_batch.clear(); + // save the original draft size + const size_t n_draft = slot.spec_draft.size(); - GGML_ASSERT(accepted.size() >= 1); + GGML_ASSERT(n_draft > 0); - const uint32_t n_rollback = slot.spec_draft.size() + 1 - accepted.size(); + // verify and try to accept the draft + { + // save the sampler sampler state in case we need to restore it + common_sampler_ptr smpl_save(common_sampler_clone(slot.smpl.get())); - const bool use_ckpt_tgt = - ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL || - (ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_RS && n_rollback > llama_n_rs_seq(ctx_tgt)); + GGML_ASSERT(slot.spec_i_batch.size() == n_draft + 1); + auto accepted = common_sampler_sample_and_accept_n(slot.smpl.get(), slot.ctx_tgt, slot.spec_i_batch, slot.spec_draft); + slot.spec_i_batch.clear(); - // check for partial draft acceptance - if (n_rollback > 0) { - if (use_ckpt_tgt) { - if (trace > 0) { - SLT_INF(slot, "accepted %2zu/%2zu draft tokens (restore checkpoint)\n", accepted.size() - 1, slot.spec_draft.size()); - } + GGML_ASSERT(accepted.size() >= 1); - // partial acceptance is not supported by the context -> truncate the draft and restore the state - slot.spec_draft = std::move(accepted); + const uint32_t n_rollback = slot.spec_draft.size() + 1 - accepted.size(); - const auto & ckpt = slot.spec_ckpt; + const bool use_ckpt_tgt = + ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_FULL || + (ctx_tgt_seq_rm_type == COMMON_CONTEXT_SEQ_RM_TYPE_RS && n_rollback > llama_n_rs_seq(ctx_tgt)); - SLT_DBG(slot, "restoring speculative checkpoint (pos_min = %d, pos_max = %d, size = %zu)\n", ckpt.pos_min, ckpt.pos_max, ckpt.size()); + // check for partial draft acceptance + if (n_rollback > 0) { + if (use_ckpt_tgt) { + if (trace > 0) { + SLT_INF(slot, "accepted %2zu/%2zu draft tokens (restore checkpoint)\n", accepted.size() - 1, slot.spec_draft.size()); + } - { - ckpt.load_tgt(slot.ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + // partial acceptance is not supported by the context -> truncate the draft and restore the state + slot.spec_draft = std::move(accepted); - common_context_seq_rm(slot.ctx_tgt, slot.id, ckpt.pos_max + 1, -1); - } + const auto & ckpt = slot.spec_ckpt; - if (slot.ctx_dft) { - ckpt.load_dft(slot.ctx_dft, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + SLT_DBG(slot, "restoring speculative checkpoint (pos_min = %d, pos_max = %d, size = %zu)\n", ckpt.pos_min, ckpt.pos_max, ckpt.size()); - common_context_seq_rm(slot.ctx_dft, slot.id, ckpt.pos_max + 1, -1); - } + { + ckpt.load_tgt(slot.ctx_tgt, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); - slot.prompt.tokens.keep_first(ckpt.n_tokens); - slot.smpl = std::move(smpl_save); + common_context_seq_rm(slot.ctx_tgt, slot.id, ckpt.pos_max + 1, -1); + } - continue; + if (slot.ctx_dft) { + ckpt.load_dft(slot.ctx_dft, slot.id, LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY); + + common_context_seq_rm(slot.ctx_dft, slot.id, ckpt.pos_max + 1, -1); } - } - if (trace > 0) { - SLT_INF(slot, "accepted %2zu/%2zu draft tokens\n", accepted.size() - 1, n_draft); - } + slot.prompt.tokens.keep_first(ckpt.n_tokens); + slot.smpl = std::move(smpl_save); - common_speculative_accept(spec.get(), slot.id, accepted.size() - 1); + return; + } + } - slot.spec_draft = std::move(accepted); + if (trace > 0) { + SLT_INF(slot, "accepted %2zu/%2zu draft tokens\n", accepted.size() - 1, n_draft); } - const int64_t t_current = ggml_time_us(); + common_speculative_accept(spec.get(), slot.id, accepted.size() - 1); - const auto ids = std::move(slot.spec_draft); + slot.spec_draft = std::move(accepted); + } - slot.t_token_generation = std::max(1, t_current - slot.t_start_generation) / 1e3; + const int64_t t_now = ggml_time_us(); - // update how many tokens out of those tested were accepted - slot.n_draft_accepted += ids.size() - 1; + const auto ids = std::move(slot.spec_draft); - // add accepted tokens to the prompt - slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft); - slot.prompt.tokens.insert({ids.begin(), ids.end() - 1}); + slot.t_token_generation = std::max(1, t_now - slot.t_start_generation) / 1e3; - slot.sampled = ids.back(); // last accepted token - SLT_DBG(slot, "add accepted tokens: sampled=%d, ids.size=%zu, n_draft=%zu\n", slot.sampled, ids.size(), n_draft); + // update how many tokens out of those tested were accepted + slot.n_draft_accepted += ids.size() - 1; + slot.n_draft_verif_steps += 1; - common_context_seq_rm(slot.ctx_tgt, slot.id, slot.prompt.tokens.pos_next(), -1); - if (slot.ctx_dft) { - common_context_seq_rm(slot.ctx_dft, slot.id, slot.prompt.tokens.pos_next(), -1); - } + if (slot.n_accepted_per_pos.empty()) { + slot.n_accepted_per_pos.resize(common_speculative_n_max(¶ms_base.speculative), 0); + } + for (size_t i = 0; i < ids.size() - 1 && i < slot.n_accepted_per_pos.size(); ++i) { + slot.n_accepted_per_pos[i]++; + } - for (size_t i = 0; i < ids.size(); ++i) { - completion_token_output result; + // add accepted tokens to the prompt + slot.prompt.tokens.keep_first(slot.prompt.n_tokens() - n_draft); + slot.prompt.tokens.insert({ids.begin(), ids.end() - 1}); - result.tok = ids[i]; - result.text_to_send = common_token_to_piece(slot.ctx_tgt, result.tok, accept_special_token(slot, result.tok)); - result.prob = 1.0f; // set later + slot.sampled = ids.back(); // last accepted token + SLT_DBG(slot, "add accepted tokens: sampled=%d, ids.size=%zu, n_draft=%zu\n", slot.sampled, ids.size(), n_draft); - // TODO: set result.probs + common_context_seq_rm(slot.ctx_tgt, slot.id, slot.prompt.tokens.pos_next(), -1); + if (slot.ctx_dft) { + common_context_seq_rm(slot.ctx_dft, slot.id, slot.prompt.tokens.pos_next(), -1); + } - slot.n_decoded += 1; + for (size_t i = 0; i < ids.size(); ++i) { + completion_token_output result; - if (!process_token(result, slot)) { - slot.print_timings(); - send_final_response(slot); - metrics.on_prediction(slot); - slot.release(); + result.tok = ids[i]; + result.text_to_send = common_token_to_piece(slot.ctx_tgt, result.tok, accept_special_token(slot, result.tok)); + result.prob = 1.0f; // set later - break; - } - } + // TODO: set result.probs - slot.print_timings_tg(); + slot.n_decoded += 1; - SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) n_draft, slot.prompt.n_tokens()); + if (!process_token(result, slot)) { + slot.print_timings(); + send_final_response(slot); + metrics.on_prediction(slot); + slot.release(); + + return; + } } - } - SRV_DBG("%s", "run slots completed\n"); + slot.print_timings_tg(); + + SLT_DBG(slot, "accepted %d/%d draft tokens, new n_tokens = %d\n", (int) ids.size() - 1, (int) n_draft, slot.prompt.n_tokens()); + }); } int get_slot_n_ctx() { @@ -3593,7 +3983,6 @@ server_context_meta server_context::get_meta() const { /* has_inp_audio */ impl->chat_params.allow_audio, /* has_inp_video */ impl->chat_params.allow_video, /* json_ui_settings */ impl->json_ui_settings, - /* json_webui_settings */ impl->json_webui_settings, // Deprecated /* slot_n_ctx */ impl->get_slot_n_ctx(), /* pooling_type */ llama_pooling_type(impl->ctx_tgt), @@ -3634,6 +4023,15 @@ struct server_res_generator : server_http_res { queue_tasks.wait_until_no_sleep(); } } + ~server_res_generator() override { + // cleanup() must run while rd is still alive (rd is destroyed after this body returns) + if (spipe) { + spipe->cleanup(); + } + } + void stop() override { + rd.stop(); + } void ok(const json & response_data) { status = 200; data = safe_json_to_str(response_data); @@ -3644,58 +4042,16 @@ struct server_res_generator : server_http_res { } }; -void server_context::on_sleeping_changed(std::function callback) { - impl->queue_tasks.on_sleeping_state(std::move(callback)); -} - -// compute the number of tokens before the last user message in the prompt -static int32_t prompt_get_n_before_user( - const json & message_spans, - const std::string & prompt, - const std::vector & files, - const llama_vocab * vocab, - mtmd_context * mctx) { - int32_t result = -1; - int32_t byte_pos = -1; - - for (const auto & span : message_spans) { - const std::string role = json_value(span, "role", std::string()); - - if (role == "user") { - byte_pos = json_value(span, "pos", -1); - } - } - - if (byte_pos >= 0) { - GGML_ASSERT((size_t) byte_pos <= prompt.size()); - - const std::string prefix = prompt.substr(0, (size_t) byte_pos); - - const std::string marker = get_media_marker(); - size_t n_prefix_media = 0; - for (size_t pos = 0; (pos = prefix.find(marker, pos)) != std::string::npos; pos += marker.size()) { - n_prefix_media++; +void server_context::set_state_callback(server_state_callback_t callback) { + impl->callback_state = std::move(callback); + impl->queue_tasks.on_sleeping_state([this](bool sleeping) { + if (sleeping) { + impl->callback_state(SERVER_STATE_SLEEPING, {}); } - - GGML_ASSERT(n_prefix_media <= files.size()); - - if (mctx != nullptr && n_prefix_media > 0) { - // TODO: this makes a copy - avoid it - std::vector prefix_files(files.begin(), files.begin() + n_prefix_media); - - result = (int32_t) process_mtmd_prompt(mctx, prefix, prefix_files).size(); - } else { - result = (int32_t) tokenize_input_prompts(vocab, nullptr, prefix, true, true)[0].size(); - } - - SRV_TRC("message_spans: last user message: byte_pos=%d, media=%zu, n_before_user=%d\n", - byte_pos, n_prefix_media, result); - } - - return result; + // for sleeping == false, event is emitted by load_model() + }); } - // // server_routes // @@ -3743,29 +4099,24 @@ std::unique_ptr server_routes::handle_completions_impl( // tasks.reserve(inputs.size()); // TODO: this is inaccurate due to child tasks + // message delimiters for checkpointing + auto delimiters = common_chat_msg_delimiters_parse(json_value(data, "message_delimiters", json::array())); + delimiters.tokenize(ctx_server.vocab); + for (size_t i = 0; i < inputs.size(); i++) { server_task task = server_task(type); task.id = rd.get_new_id(); task.tokens = std::move(inputs[i]); - task.params = server_task::params_from_json_cmpl( + task.params = server_schema::eval_llama_cmpl_schema( ctx_server.vocab, params, meta->slot_n_ctx, meta->logit_bias_eog, data); - const auto message_spans = json_value(data, "message_spans", json::array()); - if (prompt.is_string() && message_spans.is_array()) { - task.params.n_before_user = - prompt_get_n_before_user( - message_spans, - prompt.get(), - files, - ctx_server.vocab, - ctx_server.mctx); - } + task.params.message_spans = task.tokens.find_message_spans(delimiters); task.id_slot = json_value(data, "id_slot", -1); @@ -3869,8 +4220,10 @@ std::unique_ptr server_routes::handle_completions_impl( } }; + auto effective_should_stop = stream_aware_should_stop(res_this, req.should_stop); + try { - if (req.should_stop()) { + if (effective_should_stop()) { SRV_DBG("%s", "stopping streaming due to should_stop condition\n"); return false; // should_stop condition met } @@ -3904,8 +4257,8 @@ std::unique_ptr server_routes::handle_completions_impl( // receive subsequent results bool timeout = false; int64_t start_time = ggml_time_ms(); - auto result = rd.next([&timeout, &req, &start_time, ¶ms]() { - if (req.should_stop()) { + auto result = rd.next([&timeout, &start_time, ¶ms, &effective_should_stop]() { + if (effective_should_stop()) { return true; // should_stop condition met } else if (params.sse_ping_interval > 0 && ggml_time_ms() - start_time > (int64_t)params.sse_ping_interval * 1000) { timeout = true; @@ -3923,7 +4276,7 @@ std::unique_ptr server_routes::handle_completions_impl( if (result == nullptr) { SRV_DBG("%s", "stopping streaming due to should_stop condition\n"); - GGML_ASSERT(req.should_stop()); + GGML_ASSERT(effective_should_stop()); return false; // should_stop condition met } @@ -3961,6 +4314,10 @@ std::unique_ptr server_routes::handle_completions_impl( }; } + // attach a producer pipe to the response when X-Conversation-Id is present. + // the pipe mirrors SSE chunks into the ring buffer and wires up the cancel hook. + stream_session_attach_pipe(*res, req.headers); + return res; } @@ -4206,19 +4563,15 @@ void server_routes::init_routes() { { "endpoint_slots", params.endpoint_slots }, { "endpoint_props", params.endpoint_props }, { "endpoint_metrics", params.endpoint_metrics }, - // New keys - { "ui", params.ui }, - { "ui_settings", meta->json_ui_settings }, - // Deprecated: use ui/ui_settings instead (kept for backward compat) - { "webui", params.webui }, - { "webui_settings", meta->json_webui_settings }, + { "ui", params.ui }, + { "ui_settings", meta->json_ui_settings }, { "chat_template", tmpl_default }, { "chat_template_caps", meta->chat_template_caps }, { "bos_token", meta->bos_token_str }, { "eos_token", meta->eos_token_str }, { "build_info", meta->build_info }, { "is_sleeping", queue_tasks.is_sleeping() }, - { "cors_proxy_enabled", params.ui_mcp_proxy || params.webui_mcp_proxy }, + { "cors_proxy_enabled", params.ui_mcp_proxy }, }; if (params.use_jinja) { if (!tmpl_tools.empty()) { diff --git a/tools/server/server-context.h b/tools/server/server-context.h index 0e84785af497..952f825f7245 100644 --- a/tools/server/server-context.h +++ b/tools/server/server-context.h @@ -22,8 +22,7 @@ struct server_context_meta { bool has_inp_image; bool has_inp_audio; bool has_inp_video; - json json_ui_settings; // Primary: new name - json json_webui_settings; // Deprecated: use json_ui_settings instead (kept for backward compat) + json json_ui_settings; int slot_n_ctx; enum llama_pooling_type pooling_type; @@ -53,6 +52,33 @@ struct server_context_meta { uint64_t model_size; }; +enum server_state { + SERVER_STATE_DOWNLOADING, + SERVER_STATE_LOADING, + SERVER_STATE_READY, + SERVER_STATE_SLEEPING, +}; + +static std::string server_state_to_str(server_state state) { + switch (state) { + case SERVER_STATE_DOWNLOADING: return "downloading"; + case SERVER_STATE_LOADING: return "loading"; + case SERVER_STATE_READY: return "ready"; + case SERVER_STATE_SLEEPING: return "sleeping"; + default: GGML_ASSERT(false && "invalid server_state"); + } +} + +static server_state server_state_from_str(const std::string & str) { + if (str == "downloading") return SERVER_STATE_DOWNLOADING; + if (str == "loading") return SERVER_STATE_LOADING; + if (str == "ready") return SERVER_STATE_READY; + if (str == "sleeping") return SERVER_STATE_SLEEPING; + GGML_ASSERT(false && "invalid server_state string"); +} + +using server_state_callback_t = std::function; + struct server_context { std::unique_ptr impl; @@ -80,9 +106,8 @@ struct server_context { // not thread-safe, should only be used from the main thread server_context_meta get_meta() const; - // register a callback to be called when sleeping state changes - // must be set before load_model() is called - void on_sleeping_changed(std::function callback); + // note: must be set before load_model() is called + void set_state_callback(server_state_callback_t callback); }; diff --git a/tools/server/server-cors-proxy.h b/tools/server/server-cors-proxy.h index 2af0c7e1c21b..53a6909ed2f2 100644 --- a/tools/server/server-cors-proxy.h +++ b/tools/server/server-cors-proxy.h @@ -7,9 +7,18 @@ #include #include #include +#include +#include #include "server-http.h" +static std::string proxy_header_to_lower(std::string header) { + std::transform(header.begin(), header.end(), header.begin(), [](unsigned char c) { + return std::tolower(c); + }); + return header; +} + static server_http_res_ptr proxy_request(const server_http_req & req, std::string method) { std::string target_url = req.get_param("url"); common_http_url parsed_url = common_http_parse_url(target_url); @@ -33,11 +42,18 @@ static server_http_res_ptr proxy_request(const server_http_req & req, std::strin SRV_INF("proxying %s request to %s://%s:%i%s\n", method.c_str(), parsed_url.scheme.c_str(), parsed_url.host.c_str(), parsed_url.port, parsed_url.path.c_str()); std::map headers; + const std::string proxy_header_prefix = "x-llama-server-proxy-header-"; for (auto [key, value] : req.headers) { - auto new_key = key; - if (string_starts_with(new_key, "x-proxy-header-")) { - string_replace_all(new_key, "x-proxy-header-", ""); + const std::string lowered_key = proxy_header_to_lower(key); + if (!string_starts_with(lowered_key, proxy_header_prefix)) { + continue; } + + auto new_key = key.substr(proxy_header_prefix.size()); + if (new_key.empty()) { + continue; + } + headers[new_key] = value; } diff --git a/tools/server/server-http.cpp b/tools/server/server-http.cpp index 34a20c9d22de..82f34edac069 100644 --- a/tools/server/server-http.cpp +++ b/tools/server/server-http.cpp @@ -1,5 +1,6 @@ #include "common.h" #include "server-http.h" +#include "server-stream.h" #include "server-common.h" #include "ui.h" @@ -113,7 +114,7 @@ bool server_http_context::init(const common_params & params) { #endif srv->set_default_headers({{"Server", "llama.cpp"}}); - srv->set_logger(log_server_request); + // srv->set_logger(log_server_request); // TODO @ngxson : this is too spamy, no very useful; improve it in the future srv->set_exception_handler([](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) { // this is fail-safe; exceptions should already handled by `ex_wrapper` @@ -173,25 +174,29 @@ bool server_http_context::init(const common_params & params) { // Middlewares // - auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) { - static const std::unordered_set public_endpoints = { + // Public endpoints - API routes plus all embedded UI assets + static const std::unordered_set get_public_endpoints = []() { + std::unordered_set endpoints { "/health", "/v1/health", "/models", "/v1/models", "/", - "/index.html", - "/bundle.js", - "/bundle.css", }; + for (const llama_ui_asset & a : llama_ui_get_assets()) { + endpoints.insert("/" + a.name); + } + return endpoints; + }(); + auto middleware_validate_api_key = [api_keys = params.api_keys](const httplib::Request & req, httplib::Response & res) { // If API key is not set, skip validation if (api_keys.empty()) { return true; } - // If path is public or static file, skip validation - if (public_endpoints.find(req.path) != public_endpoints.end()) { + // If path is public or a UI asset, skip validation + if (get_public_endpoints.count(req.path)) { return true; } @@ -315,33 +320,84 @@ bool server_http_context::init(const common_params & params) { } } else { #if defined(LLAMA_UI_HAS_ASSETS) - auto serve_asset = [](const std::string & name, const char * mime, bool with_isolation_headers) { - return [name, mime, with_isolation_headers](const httplib::Request & req, httplib::Response & res) { - const llama_ui_asset * a = llama_ui_find_asset(name.c_str()); - if (!a) { - res.status = 404; - return false; + static auto handle_gzip_header = [](const httplib::Request & req, httplib::Response & res) { + if (!llama_ui_use_gzip()) { + // no gzip build, skip + return true; + } + if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) { + res.status = 415; // unsupported media type + res.set_content("Error: gzip is not supported by this browser", "text/plain"); + return false; + } else { + res.set_header("Content-Encoding", "gzip"); + } + return true; + }; + + auto serve_asset_cached = [](const std::string & name, bool isolation) { + return [name, isolation](const httplib::Request & req, httplib::Response & res) { + if (!handle_gzip_header(req, res)) { + return true; // returns error message } + const llama_ui_asset * a = llama_ui_find_asset(name); + if (!a) { res.status = 404; return false; } res.set_header("ETag", a->etag); - // Check If-None-Match for conditional GET (304 Not Modified) if (const std::string & inm = req.get_header_value("If-None-Match"); !inm.empty() && (inm == a->etag || inm == std::string("W/") + a->etag)) { res.status = 304; return false; } - if (with_isolation_headers) { - // COEP and COOP headers, required by pyodide (python interpreter) + if (isolation) { res.set_header("Cross-Origin-Embedder-Policy", "require-corp"); - res.set_header("Cross-Origin-Opener-Policy", "same-origin"); + res.set_header("Cross-Origin-Opener-Policy", "same-origin"); } - res.set_content(reinterpret_cast(a->data), a->size, mime); + res.set_header("Cache-Control", "public, max-age=31536000, immutable"); + res.set_content(reinterpret_cast(a->data), a->size, a->type.c_str()); return false; }; }; - srv->Get(params.api_prefix + "/", serve_asset("index.html", "text/html; charset=utf-8", true)); - srv->Get(params.api_prefix + "/bundle.js", serve_asset("bundle.js", "application/javascript; charset=utf-8", false)); - srv->Get(params.api_prefix + "/bundle.css", serve_asset("bundle.css", "text/css; charset=utf-8", false)); + auto serve_asset_nocache = [](const std::string & name) { + return [name](const httplib::Request & req, httplib::Response & res) { + if (!handle_gzip_header(req, res)) { + return true; // returns error message + } + const llama_ui_asset * a = llama_ui_find_asset(name); + if (!a) { + res.status = 404; + return false; + } + res.set_header("Cache-Control", "no-cache"); + res.set_content(reinterpret_cast(a->data), a->size, a->type.c_str()); + return false; + }; + }; + + // main index file + srv->Get(params.api_prefix + "/", serve_asset_cached("index.html", true)); + srv->Get(params.api_prefix + "/index.html", serve_asset_cached("index.html", true)); + + // All remaining assets registered directly from the embedded asset table. + // PWA revalidation files (sw.js, manifest, version.json) use no-cache; + // everything else is immutable. + static const std::unordered_set no_cache_names = { + "sw.js", + "manifest.webmanifest", + "_app/version.json", + "build.json" + }; + + for (const auto & a : llama_ui_get_assets()) { + if (a.name == "index.html") continue; // served at "/" and "/index.html" above + if (no_cache_names.count(a.name)) { + SRV_DBG("serve nocache for %s\n", a.name.c_str()); + srv->Get(params.api_prefix + "/" + a.name, serve_asset_nocache(a.name)); + } else { + srv->Get(params.api_prefix + "/" + a.name, serve_asset_cached(a.name, false)); + } + } + #endif } } @@ -401,13 +457,40 @@ static void set_headers(httplib::Response & res, const std::map int { + if (c >= '0' && c <= '9') return c - '0'; + if (c >= 'a' && c <= 'f') return c - 'a' + 10; + if (c >= 'A' && c <= 'F') return c - 'A' + 10; + return -1; + }; + int hi = hex(in[i + 1]); + int lo = hex(in[i + 2]); + if (hi >= 0 && lo >= 0) { + out.push_back(char((hi << 4) | lo)); + i += 2; + continue; + } + } + out.push_back(in[i]); + } + return out; +} + static std::map get_params(const httplib::Request & req) { std::map params; for (const auto & [key, value] : req.params) { params[key] = value; } for (const auto & [key, value] : req.path_params) { - params[key] = value; + params[key] = decode_path_component(value); } return params; } @@ -437,29 +520,46 @@ using server_http_req_ptr = std::unique_ptr; static void process_handler_response(server_http_req_ptr && request, server_http_res_ptr & response, httplib::Response & res) { if (response->is_stream()) { res.status = response->status; + // Tell Nginx to not buffer any streamed response + response->headers["X-Accel-Buffering"] = "no"; set_headers(res, response->headers); const std::string content_type = response->content_type; // convert to shared_ptr as both chunked_content_provider() and on_complete() need to use it - std::shared_ptr q_ptr = std::move(request); - std::shared_ptr r_ptr = std::move(response); - const auto chunked_content_provider = [response = r_ptr](size_t, const httplib::DataSink & sink) -> bool { + std::shared_ptr q_ptr = std::move(request); + std::shared_ptr r_ptr = std::move(response); + + const auto chunked_content_provider = [response = r_ptr](size_t, httplib::DataSink & sink) -> bool { std::string chunk; const bool has_next = response->next(chunk); if (!chunk.empty()) { + // mirror into the ring buffer first, the session must reflect every SSE chunk + // whether or not the wire write below succeeds + if (response->spipe) { + response->spipe->write(chunk.data(), chunk.size()); + } if (!sink.write(chunk.data(), chunk.size())) { + // peer is gone, stop the wire path here return false; } SRV_DBG("http: streamed chunk: %s\n", chunk.c_str()); } if (!has_next) { + // producer reached its natural end on the wire, a later close() skips the drain + if (response->spipe) { + response->spipe->done(); + } sink.done(); SRV_DBG("%s", "http: stream ended\n"); } return has_next; }; const auto on_complete = [request = q_ptr, response = r_ptr](bool) mutable { - response.reset(); // trigger the destruction of the response object - request.reset(); // trigger the destruction of the request object + // on a dropped peer, close() drains the rest of the generation into the ring buffer + if (response->spipe) { + response->spipe->close(); + } + response.reset(); // spipe destructor finalizes the session if attached + request.reset(); }; res.set_chunked_content_provider(content_type, chunked_content_provider, on_complete); } else { @@ -533,6 +633,23 @@ void server_http_context::post(const std::string & path, const server_http_conte }); } +void server_http_context::del(const std::string & path, const server_http_context::handler_t & handler) const { + handlers.emplace(path, handler); + pimpl->srv->Delete(path_prefix + path, [handler](const httplib::Request & req, httplib::Response & res) { + server_http_req_ptr request = std::make_unique(server_http_req{ + get_params(req), + get_headers(req), + req.path, + build_query_string(req), + req.body, + {}, + req.is_connection_closed + }); + server_http_res_ptr response = handler(*request); + process_handler_response(std::move(request), response, res); + }); +} + // // Vertex AI Prediction protocol (AIP_PREDICT_ROUTE) // https://cloud.google.com/vertex-ai/docs/predictions/custom-container-requirements diff --git a/tools/server/server-http.h b/tools/server/server-http.h index 25c7f10629b3..350813183671 100644 --- a/tools/server/server-http.h +++ b/tools/server/server-http.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -10,6 +11,7 @@ #include struct common_params; +struct stream_pipe_producer; // defined in server-stream.h // generator-like API for HTTP response generation // this object response with one of the 2 modes: @@ -23,12 +25,20 @@ struct server_http_res { std::string data; std::map headers; - // TODO: move this to a virtual function once we have proper polymorphism support + // if set, the stream survives a client disconnect: the producer pipe keeps draining into the + // ring buffer and finalizes the session on destruction, so no explicit on_stream_end is needed. + // shared_ptr (not unique_ptr) so the forward-declared type is safe to delete here. + std::shared_ptr spipe; + std::function next = nullptr; bool is_stream() const { return next != nullptr; } + // called when the session is cancelled (e.g. DELETE /v1/stream/). + // server_res_generator overrides this to stop its reader; the default is a no-op. + virtual void stop() {} + virtual ~server_http_res() = default; }; @@ -86,6 +96,7 @@ struct server_http_context { void get(const std::string & path, const handler_t & handler) const; void post(const std::string & path, const handler_t & handler) const; + void del(const std::string & path, const handler_t & handler) const; // Register the Google Cloud Platform (Vertex AI) compat (AIP_PREDICT_ROUTE env var, or /predict) // Must be called AFTER all other API routes are registered diff --git a/tools/server/server-models.cpp b/tools/server/server-models.cpp index 49b0e423f462..0380f98a3615 100644 --- a/tools/server/server-models.cpp +++ b/tools/server/server-models.cpp @@ -1,14 +1,18 @@ #include "server-common.h" #include "server-models.h" +#include "server-context.h" +#include "server-stream.h" #include "build-info.h" #include "preset.h" #include "download.h" #include // TODO: remove this once we use HTTP client from download.h +#include #include #include +#include #include #include #include @@ -43,14 +47,57 @@ extern char **environ; #define DEFAULT_STOP_TIMEOUT 10 // seconds #define CMD_ROUTER_TO_CHILD_EXIT "cmd_router_to_child:exit" -#define CMD_CHILD_TO_ROUTER_READY "cmd_child_to_router:ready" // also sent when waking up from sleep -#define CMD_CHILD_TO_ROUTER_SLEEP "cmd_child_to_router:sleep" -#define CMD_CHILD_TO_ROUTER_INFO "cmd_child_to_router:info:" // followed by json string +#define CMD_CHILD_TO_ROUTER_STATE "cmd_child_to_router:state:" // followed by json string // address for child process, this is needed because router may run on 0.0.0.0 // ref: https://github.com/ggml-org/llama.cpp/issues/17862 #define CHILD_ADDR "127.0.0.1" +struct server_subproc { + std::optional sproc; // empty while in DOWNLOADING state + std::atomic stopped{false}; // set to cancel a download or signal child process exit + + subprocess_s & get() { + GGML_ASSERT(sproc.has_value() && "subprocess not initialized"); + return sproc.value(); + } + + bool is_alive() { + return sproc.has_value() && subprocess_alive(&sproc.value()); + } + + void request_exit() { + if (sproc.has_value()) { + FILE * stdin_file = subprocess_stdin(&sproc.value()); + if (stdin_file) { + fprintf(stdin_file, "%s\n", CMD_ROUTER_TO_CHILD_EXIT); + fflush(stdin_file); + } + } + stopped.store(true, std::memory_order_relaxed); + } + + void terminate() { + if (!sproc.has_value()) { + return; + } +#if defined(_WIN32) + if (sproc->hProcess == NULL) { + return; + } +#else + if (sproc->child <= 0) { + return; + } +#endif + subprocess_terminate(&sproc.value()); + } +}; + +// short loopback budget for the resumable stream router to child JSON calls (probe, lookup, +// delete). distinct from params.timeout_read/write which only applies to the generation proxy +static constexpr int STREAM_LOOKUP_TIMEOUT_MS = 250; + static std::filesystem::path get_server_exec_path() { #if defined(_WIN32) wchar_t buf[32768] = { 0 }; // Large buffer to handle long paths @@ -181,8 +228,8 @@ void server_model_meta::update_caps() { "LLAMA_ARG_HF_REPO_FILE", }); params.offline = true; - // params.skip_download = true; // TODO: ideally, we should validate the model here, but it takes too much time - common_params_handle_models(params, LLAMA_EXAMPLE_SERVER); + common_models_handler handler = common_models_handler_init(params, LLAMA_EXAMPLE_SERVER); + common_models_handler_apply(handler, params); // note: this won't download the model because offline=true if (params.mmproj.path.empty()) { multimodal = { false, false }; } else { @@ -272,14 +319,27 @@ void server_models::add_model(server_model_meta && meta) { meta.update_caps(); std::string name = meta.name; mapping[name] = instance_t{ - /* subproc */ std::make_shared(), + /* subproc */ std::make_shared(), /* th */ std::thread(), /* meta */ std::move(meta) }; } +void server_models::notify_sse(const std::string & event, const std::string & model_id, const json & data) { + std::unique_ptr result = std::make_unique(); + result->data = { + {"model", model_id}, + {"event", event}, + }; + if (!data.is_null()) { + result->data["data"] = data; + } + SRV_DBG("notifying SSE clients about event '%s' for model '%s': %s\n", event.c_str(), model_id.c_str(), safe_json_to_str(result->data).c_str()); + sse.broadcast(std::move(result)); +} + void server_models::load_models() { - // Phase 1: load presets from all sources — pure I/O, no lock needed + // Phase 1: load presets from all sources - pure I/O, no lock needed // 1. cached models common_presets cached_models = ctx_preset.load_from_cache(); SRV_INF("Loaded %zu cached model presets\n", cached_models.size()); @@ -304,21 +364,35 @@ void server_models::load_models() { // note: if a model exists in both cached and local, local takes precedence common_presets final_presets; - for (const auto & [name, preset] : cached_models) final_presets[name] = preset; - for (const auto & [name, preset] : local_models) final_presets[name] = preset; + std::unordered_map source_map; + for (const auto & [name, preset] : cached_models) { + final_presets[name] = preset; + source_map[name] = SERVER_MODEL_SOURCE_CACHE; + } + for (const auto & [name, preset] : local_models) { + final_presets[name] = preset; + source_map[name] = SERVER_MODEL_SOURCE_MODELS_DIR; + } for (const auto & [name, custom] : custom_presets) { if (final_presets.find(name) != final_presets.end()) { final_presets[name].merge(custom); } else { final_presets[name] = custom; } + source_map[name] = SERVER_MODEL_SOURCE_PRESET; } - // server base preset from CLI args takes highest precedence + + // overlay router's own CLI args on top of every model preset so that + // e.g. `llama-server --temp 0` is honoured by all child processes for (auto & [name, preset] : final_presets) { preset.merge(base_preset); } - // Helpers that read `mapping` — must be called while holding the lock. + auto get_source = [&](const std::string & name) { + return source_map.count(name) ? source_map.at(name) : SERVER_MODEL_SOURCE_PRESET; + }; + + // Helpers that read `mapping` - must be called while holding the lock. std::unordered_set custom_names; for (const auto & [name, preset] : custom_presets) custom_names.insert(name); auto join_set = [](const std::set & s) { @@ -366,12 +440,15 @@ void server_models::load_models() { // (unload, load) or when joining threads (the monitoring thread calls update_status // which locks the mutex, so joining while holding it would deadlock). std::unique_lock lk(mutex); + + need_reload = false; bool is_first_load = mapping.empty(); if (is_first_load) { // FIRST LOAD: add all models, then unlock for autoloading for (const auto & [name, preset] : final_presets) { server_model_meta meta{ + /* source */ get_source(name), /* preset */ preset, /* name */ name, /* aliases */ {}, @@ -381,10 +458,11 @@ void server_models::load_models() { /* last_used */ 0, /* args */ std::vector(), /* loaded_info */ {}, + /* progress */ {}, /* exit_code */ 0, /* stop_timeout */ DEFAULT_STOP_TIMEOUT, /* multimodal */ mtmd_caps{false, false}, - /* need_download */ false, + // /* need_download */ false, }; add_model(std::move(meta)); } @@ -453,19 +531,30 @@ void server_models::load_models() { } } for (auto & [name, inst] : mapping) { + if (inst.meta.status == SERVER_MODEL_STATUS_DOWNLOADING) { + continue; // downloading models are not from config sources, leave them alone + } if (final_presets.find(name) == final_presets.end() && !inst.meta.is_running() && inst.th.joinable()) { threads_to_join.push_back(std::move(inst.th)); } } - // join outside the lock — monitoring thread calls update_status (needs lock) + // join outside the lock - monitoring thread calls update_status (needs lock) lk.unlock(); for (auto & th : threads_to_join) th.join(); lk.lock(); // erase models no longer in any source for (auto it = mapping.begin(); it != mapping.end(); ) { - if (final_presets.find(it->first) == final_presets.end()) { + if (it->second.meta.status == SERVER_MODEL_STATUS_DOWNLOADING) { + ++it; // download thread is still busy, skip + } else if (it->second.meta.status == SERVER_MODEL_STATUS_DOWNLOADED) { + // download finished, safe to erase + if (it->second.th.joinable()) { + it->second.th.join(); + } + it = mapping.erase(it); + } else if (final_presets.find(it->first) == final_presets.end()) { SRV_INF("(reload) removing model name=%s (no longer in source)\n", it->first.c_str()); GGML_ASSERT(!it->second.th.joinable()); // must have been joined above it = mapping.erase(it); @@ -526,6 +615,7 @@ void server_models::load_models() { for (const auto & [name, preset] : final_presets) { if (mapping.find(name) == mapping.end()) { server_model_meta meta{ + /* source */ get_source(name), /* preset */ preset, /* name */ name, /* aliases */ {}, @@ -535,10 +625,11 @@ void server_models::load_models() { /* last_used */ 0, /* args */ std::vector(), /* loaded_info */ {}, + /* progress */ {}, /* exit_code */ 0, /* stop_timeout */ DEFAULT_STOP_TIMEOUT, /* multimodal */ mtmd_caps{false, false}, - /* need_download */ false, + // /* need_download */ false, }; add_model(std::move(meta)); newly_added.push_back(name); @@ -547,7 +638,7 @@ void server_models::load_models() { apply_stop_timeout(); - // clear reload flag before unlocking for autoload — load() blocks on !is_reloading, + // clear reload flag before unlocking for autoload - load() blocks on !is_reloading, // so clearing it here (while still locked) prevents a deadlock in the autoload calls below is_reloading = false; cv.notify_all(); @@ -571,6 +662,8 @@ void server_models::load_models() { SRV_INF("(reload) loading new model %s\n", name.c_str()); load(name); } + + notify_sse("models_reload", "*"); } } @@ -597,7 +690,13 @@ bool server_models::has_model(const std::string & name) { } std::optional server_models::get_meta(const std::string & name) { - std::lock_guard lk(mutex); + std::unique_lock lk(mutex); + if (need_reload) { + lk.unlock(); + load_models(); + lk.lock(); + } + auto it = mapping.find(name); if (it != mapping.end()) { return it->second.meta; @@ -683,7 +782,13 @@ static std::vector to_char_ptr_array(const std::vector & ve } std::vector server_models::get_all_meta() { - std::lock_guard lk(mutex); + std::unique_lock lk(mutex); + if (need_reload) { + lk.unlock(); + load_models(); + lk.lock(); + } + std::vector result; result.reserve(mapping.size()); for (const auto & [name, inst] : mapping) { @@ -726,17 +831,23 @@ void server_models::unload_lru() { } void server_models::load(const std::string & name) { - if (!has_model(name)) { - throw std::runtime_error("model name=" + name + " is not found"); + load(name, load_options{}); +} + +void server_models::load(const std::string & name, const load_options & opts) { + if (!opts.custom_meta.has_value()) { + if (!has_model(name)) { + throw std::runtime_error("model name=" + name + " is not found"); + } + unload_lru(); } - unload_lru(); std::unique_lock lk(mutex); // edge case: block until any in-progress reload has finished so we always load // against the freshest preset and a consistent mapping state cv.wait(lk, [this]() { return !is_reloading; }); - auto meta = mapping[name].meta; + auto meta = opts.custom_meta.has_value() ? *opts.custom_meta : mapping[name].meta; if (meta.status != SERVER_MODEL_STATUS_UNLOADED) { SRV_INF("model %s is not ready\n", name.c_str()); return; @@ -770,7 +881,7 @@ void server_models::load(const std::string & name) { throw std::runtime_error("failed to get a port number"); } - inst.subproc = std::make_shared(); + inst.subproc = std::make_shared(); { SRV_INF("spawning server instance with name=%s on port %d\n", inst.meta.name.c_str(), inst.meta.port); @@ -780,6 +891,12 @@ void server_models::load(const std::string & name) { std::vector child_env = base_env; // copy child_env.push_back("LLAMA_SERVER_ROUTER_PORT=" + std::to_string(base_params.port)); + if (opts.mode == SERVER_CHILD_MODE_DOWNLOAD) { + inst.meta.status = SERVER_MODEL_STATUS_DOWNLOADING; + child_env.push_back("LLAMA_SERVER_CHILD_MODE=download"); + child_env.push_back("LLAMA_ARG_HF_REPO=" + name); + } + SRV_INF("%s", "spawning server instance with args:\n"); for (const auto & arg : child_args) { SRV_INF(" %s\n", arg.c_str()); @@ -792,19 +909,24 @@ void server_models::load(const std::string & name) { // TODO @ngxson : maybe separate stdout and stderr in the future // so that we can use stdout for commands and stderr for logging int options = subprocess_option_no_window | subprocess_option_combined_stdout_stderr; - int result = subprocess_create_ex(argv.data(), options, envp.data(), inst.subproc.get()); + inst.subproc->sproc.emplace(); + int result = subprocess_create_ex(argv.data(), options, envp.data(), &inst.subproc->get()); if (result != 0) { throw std::runtime_error("failed to spawn server instance"); } - - inst.stdin_file = subprocess_stdin(inst.subproc.get()); } // start a thread to manage the child process // captured variables are guaranteed to be destroyed only after the thread is joined - inst.th = std::thread([this, name, child_proc = inst.subproc, port = inst.meta.port, stop_timeout = inst.meta.stop_timeout]() { - FILE * stdin_file = subprocess_stdin(child_proc.get()); - FILE * stdout_file = subprocess_stdout(child_proc.get()); // combined stdout/stderr + inst.th = std::thread([ + this, name, + child_proc = inst.subproc, + port = inst.meta.port, + stop_timeout = inst.meta.stop_timeout, + child_mode = opts.mode + ]() { + FILE * stdin_file = subprocess_stdin(&child_proc->get()); + FILE * stdout_file = subprocess_stdout(&child_proc->get()); // combined stdout/stderr std::thread log_thread([&]() { // read stdout/stderr and forward to main server log @@ -815,12 +937,8 @@ void server_models::load(const std::string & name) { while (fgets(buffer, vec_buf.size(), stdout_file) != nullptr) { LOG("[%5d] %s", port, buffer); std::string str(buffer); - if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_READY)) { - this->update_status(name, SERVER_MODEL_STATUS_LOADED, 0); - } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_INFO)) { - this->update_loaded_info(name, str); - } else if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_SLEEP)) { - this->update_status(name, SERVER_MODEL_STATUS_SLEEPING, 0); + if (string_starts_with(buffer, CMD_CHILD_TO_ROUTER_STATE)) { + this->handle_child_state(name, str); } } } else { @@ -829,50 +947,49 @@ void server_models::load(const std::string & name) { }); std::thread stopping_thread([&]() { - // thread to monitor stopping signal OR child crash + // thread to monitor explicit stop requests; child crash is signalled via child_proc->stopped auto is_stopping = [this, &name]() { return this->stopping_models.find(name) != this->stopping_models.end(); }; - auto should_wake = [&]() { - return is_stopping() || !subprocess_alive(child_proc.get()); - }; { std::unique_lock lk(this->mutex); - this->cv_stop.wait(lk, should_wake); + this->cv_stop.wait(lk, [&]() { + return is_stopping() || child_proc->stopped.load(std::memory_order_acquire); + }); } - // child may have already exited (e.g. crashed) — skip shutdown sequence - if (!subprocess_alive(child_proc.get())) { + // child crashed or finished on its own, skip graceful shutdown sequence + if (child_proc->stopped.load(std::memory_order_acquire)) { return; } SRV_INF("stopping model instance name=%s\n", name.c_str()); - // send interrupt to child process fprintf(stdin_file, "%s\n", CMD_ROUTER_TO_CHILD_EXIT); fflush(stdin_file); - // wait to stop gracefully or timeout int64_t start_time = ggml_time_ms(); while (true) { std::unique_lock lk(this->mutex); - if (!is_stopping()) { - return; // already stopped + if (!is_stopping() || child_proc->stopped.load(std::memory_order_acquire)) { + return; } int64_t elapsed = ggml_time_ms() - start_time; if (elapsed >= stop_timeout * 1000) { - // timeout, force kill + lk.unlock(); SRV_WRN("force-killing model instance name=%s after %d seconds timeout\n", name.c_str(), stop_timeout); - subprocess_terminate(child_proc.get()); + child_proc->terminate(); return; } - this->cv_stop.wait_for(lk, std::chrono::seconds(1)); + this->cv_stop.wait_for(lk, std::chrono::seconds(1), [&]() { + return !is_stopping() || child_proc->stopped.load(std::memory_order_acquire); + }); } }); - // we reach here when the child process exits + // we reach here when the child process exits (stdout EOF) // note: we cannot join() prior to this point because it will close stdin_file if (log_thread.joinable()) { log_thread.join(); } - // stop the timeout monitoring thread + child_proc->stopped.store(true, std::memory_order_release); { std::lock_guard lk(this->mutex); stopping_models.erase(name); @@ -884,11 +1001,18 @@ void server_models::load(const std::string & name) { // get the exit code int exit_code = 0; - subprocess_join(child_proc.get(), &exit_code); - subprocess_destroy(child_proc.get()); + subprocess_join(&child_proc->get(), &exit_code); + subprocess_destroy(&child_proc->get()); // update status and exit code - this->update_status(name, SERVER_MODEL_STATUS_UNLOADED, exit_code); + if (child_mode == SERVER_CHILD_MODE_DOWNLOAD) { + // instance will be cleaned up on next load_models() call + } else { + this->update_status(name, { + SERVER_MODEL_STATUS_UNLOADED, + exit_code + }); + } SRV_INF("instance name=%s exited with status %d\n", name.c_str(), exit_code); }); @@ -896,30 +1020,41 @@ void server_models::load(const std::string & name) { { auto & old_instance = mapping[name]; // old process should have exited already, but just in case, we clean it up here - if (subprocess_alive(old_instance.subproc.get())) { + if (old_instance.subproc && old_instance.subproc->is_alive()) { SRV_WRN("old process for model name=%s is still alive, this is unexpected\n", name.c_str()); - subprocess_terminate(old_instance.subproc.get()); // force kill + old_instance.subproc->terminate(); // force kill } if (old_instance.th.joinable()) { old_instance.th.join(); } } + notify_sse("model_status", name, { + {"status", server_model_status_to_string(inst.meta.status)}, + }); + mapping[name] = std::move(inst); cv.notify_all(); } void server_models::unload(const std::string & name) { - std::lock_guard lk(mutex); + std::unique_lock lk(mutex); auto it = mapping.find(name); if (it != mapping.end()) { - if (it->second.meta.is_running()) { + if (it->second.meta.status == SERVER_MODEL_STATUS_DOWNLOADING) { + SRV_INF("cancelling download for model name=%s\n", name.c_str()); + it->second.subproc->request_exit(); + // for convenience, we wait the status change here + wait(lk, name, [](const server_model_meta & new_meta) { + return new_meta.status != SERVER_MODEL_STATUS_DOWNLOADING; + }); + } else if (it->second.meta.is_running()) { SRV_INF("stopping model instance name=%s\n", name.c_str()); stopping_models.insert(name); if (it->second.meta.status == SERVER_MODEL_STATUS_LOADING) { // special case: if model is in loading state, unloading means force-killing it SRV_WRN("model name=%s is still loading, force-killing\n", name.c_str()); - subprocess_terminate(it->second.subproc.get()); + it->second.subproc->terminate(); } cv_stop.notify_all(); // status change will be handled by the managing thread @@ -934,7 +1069,10 @@ void server_models::unload_all() { { std::lock_guard lk(mutex); for (auto & [name, inst] : mapping) { - if (inst.meta.is_running()) { + if (inst.meta.status == SERVER_MODEL_STATUS_DOWNLOADING) { + SRV_INF("cancelling download for model name=%s\n", name.c_str()); + inst.subproc->stopped.store(true, std::memory_order_relaxed); + } else if (inst.meta.is_running()) { SRV_INF("stopping model instance name=%s\n", name.c_str()); stopping_models.insert(name); cv_stop.notify_all(); @@ -951,48 +1089,148 @@ void server_models::unload_all() { } } -void server_models::update_status(const std::string & name, server_model_status status, int exit_code) { +void server_models::update_status(const std::string & name, const update_status_args & args) { std::unique_lock lk(mutex); auto it = mapping.find(name); if (it != mapping.end()) { auto & meta = it->second.meta; - meta.status = status; - meta.exit_code = exit_code; + meta.status = args.status; + meta.exit_code = args.exit_code; + if (!args.loaded_info.is_null()) { + meta.loaded_info = args.loaded_info; + } + if (!args.progress.is_null()) { + meta.progress = args.progress; + } + } + // broadcast status change to SSE + { + json data = { + {"status", server_model_status_to_string(args.status)}, + }; + if (args.status == SERVER_MODEL_STATUS_UNLOADED) { + data["exit_code"] = args.exit_code; + } + if (!args.loaded_info.is_null()) { + data["info"] = args.loaded_info; + } + if (!args.progress.is_null()) { + data["progress"] = args.progress; + } + // note: notify_sse doesn't acquire the lock, so no deadlock here + notify_sse("status_change", name, data); } cv.notify_all(); } -void server_models::update_loaded_info(const std::string & name, std::string & raw_info) { - if (!string_starts_with(raw_info, CMD_CHILD_TO_ROUTER_INFO)) { - SRV_WRN("invalid loaded info format from child for model name=%s: %s\n", name.c_str(), raw_info.c_str()); - return; +void server_models::update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok) { + json curr; + { + std::lock_guard lk(mutex); + auto it = mapping.find(name); + if (it != mapping.end()) { + if (done) { + // mark the instance to be erased on next load_models() call + it->second.meta.status = SERVER_MODEL_STATUS_DOWNLOADED; + need_reload = true; + } else { + json & info = it->second.meta.loaded_info; + if (!info.contains("progress")) { + info["progress"] = json{}; + } + info["progress"][progress.url] = { + {"done", progress.downloaded}, + {"total", progress.total}, + }; + curr = it->second.meta.loaded_info; // copy + } + } } - - json info; - try { - info = json::parse(raw_info.substr(strlen(CMD_CHILD_TO_ROUTER_INFO))); - } catch (const std::exception & e) { - SRV_WRN("failed to parse loaded info from child for model name=%s: %s\n", name.c_str(), e.what()); - return; + if (done) { + cv.notify_all(); // notify in case unload() is waiting for download to be cancelled + notify_sse(ok ? "download_finished" : "download_failed", name, {}); + } else { + notify_sse("download_progress", name, curr); } +} +bool server_models::remove(const std::string & name) { + // do everything under one lock acquisition; avoid get_meta() / + // unload() because they can trigger load_models() which erases + // transient DOWNLOADING / DOWNLOADED entries as a side-effect std::unique_lock lk(mutex); + auto it = mapping.find(name); - if (it != mapping.end()) { - auto & meta = it->second.meta; - meta.loaded_info = info; + if (it == mapping.end()) { + throw std::runtime_error("model name=" + name + " is not found"); } - cv.notify_all(); + if (it->second.meta.source != SERVER_MODEL_SOURCE_CACHE) { + throw std::runtime_error("model name=" + name + " is not removable (not from cache)"); + } + + if (it->second.meta.status == SERVER_MODEL_STATUS_DOWNLOADING) { + // cancel in-flight download + SRV_INF("cancelling download for model name=%s\n", name.c_str()); + it->second.subproc->request_exit(); + } else if (it->second.meta.is_running()) { + // stop running instance + SRV_INF("stopping model instance name=%s\n", name.c_str()); + stopping_models.insert(name); + if (it->second.meta.status == SERVER_MODEL_STATUS_LOADING) { + it->second.subproc->terminate(); + } + cv_stop.notify_all(); + } + + // wait until the monitoring thread finishes + wait(lk, name, [](const server_model_meta & meta) { + return meta.status == SERVER_MODEL_STATUS_UNLOADED + || meta.status == SERVER_MODEL_STATUS_DOWNLOADED; + }); + + // re-find after wait - load_models() may have erased the entry during the wait + it = mapping.find(name); + if (it == mapping.end()) { + // load_models() already joined the thread and erased the entry; + // we just need to clean up the cached files on disk + lk.unlock(); + bool ok = common_download_remove(name); + SRV_INF("removing model name=%s from cache (%s)\n", name.c_str(), ok ? "succeeded" : "partial"); + notify_sse("model_remove", name, {}); + return true; + } + + // join before erasing - thread no longer acquires this mutex + if (it->second.th.joinable()) { + it->second.th.join(); + } + + // remove from disk (best-effort: cancelled downloads may have no cached files) + bool ok = common_download_remove(name); + mapping.erase(name); + if (!ok) { + SRV_WRN("removing model name=%s from disk returned false (no cached files?)\n", name.c_str()); + } + SRV_INF("removing model name=%s from cache (%s)\n", name.c_str(), ok ? "succeeded" : "partial"); + notify_sse("model_remove", name, {}); + return true; } -void server_models::wait_until_loading_finished(const std::string & name) { +void server_models::wait(const std::string & name, std::function predicate) { std::unique_lock lk(mutex); - cv.wait(lk, [this, &name]() { + wait(lk, name, predicate); +} + +void server_models::wait(std::unique_lock & lk, const std::string & name, std::function predicate) { + cv.wait(lk, [this, &name, &predicate]() { auto it = mapping.find(name); if (it != mapping.end()) { - return it->second.meta.status != SERVER_MODEL_STATUS_LOADING; + return predicate(it->second.meta); + } - return false; + // model was removed from mapping by another code path (e.g. load_models()). + // nothing left to wait for - tell the caller to proceed. + return true; }); } @@ -1014,10 +1252,15 @@ bool server_models::ensure_model_ready(const std::string & name) { // wait for loading to complete SRV_INF("waiting until model name=%s is fully loaded...\n", name.c_str()); - wait_until_loading_finished(name); + wait(name, [&meta](const server_model_meta & new_meta) { + if (new_meta.status != SERVER_MODEL_STATUS_LOADING) { + meta = new_meta; // update meta for final check after wait + return true; + } + return false; + }); // check final status - meta = get_meta(name); if (!meta.has_value() || meta->is_failed()) { throw std::runtime_error("model name=" + name + " failed to load"); } @@ -1058,21 +1301,169 @@ server_http_res_ptr server_models::proxy_request(const server_http_req & req, co return proxy; } -bool server_models::is_child_server() { +void server_models::handle_child_state(const std::string & name, const std::string & raw_input) { + server_state state; + json payload; + + try { + json data = json::parse(raw_input.substr(strlen(CMD_CHILD_TO_ROUTER_STATE))); + state = server_state_from_str(json_value(data, "state", std::string())); + payload = json_value(data, "payload", json{}); + } catch (const std::exception & e) { + SRV_ERR("failed to parse child state update for name=%s: %s\n", name.c_str(), e.what()); + return; + } + + switch (state) { + case SERVER_STATE_DOWNLOADING: + { + std::string result = json_value(payload, "result", std::string()); + std::string url = json_value(payload, "url", std::string()); + auto request_exit = [&]() { + std::lock_guard lk(mutex); + auto it = mapping.find(name); + if (it != mapping.end()) { + return it->second.subproc->request_exit(); + } + }; + if (result == "download_finished") { + update_download_progress(name, {}, true, true); + request_exit(); + } else if (result == "download_failed") { + update_download_progress(name, {}, true, false); + request_exit(); + } else if (!url.empty()) { + common_download_progress p; + p.url = url; + p.downloaded = json_value(payload, "downloaded", (size_t)0); + p.total = json_value(payload, "total", (size_t)0); + update_download_progress(name, p, false); + } + } break; + case SERVER_STATE_LOADING: + { + update_status(name, { + SERVER_MODEL_STATUS_LOADING, + 0, + nullptr, // no loaded_info yet + payload, + }); + } break; + case SERVER_STATE_READY: + { + update_status(name, { + SERVER_MODEL_STATUS_LOADED, + 0, + // note: payload can be empty if this is a wakeup from sleep + payload.size() > 0 ? payload : nullptr, + {}, // reset progress info + }); + } break; + case SERVER_STATE_SLEEPING: + { + update_status(name, { SERVER_MODEL_STATUS_SLEEPING }); + } break; + default: + // should never happen, but just in case + GGML_ASSERT(false && "unexpected state from child server"); + } +} + +// +// server_child +// + +bool server_child::is_child() { const char * router_port = std::getenv("LLAMA_SERVER_ROUTER_PORT"); return router_port != nullptr; } -std::thread server_models::setup_child_server(const std::function & shutdown_handler, const json & model_info) { - // send a notification to the router server that a model instance is ready - common_log_pause(common_log_main()); - fflush(stdout); - fprintf(stdout, "%s\n", CMD_CHILD_TO_ROUTER_READY); - fflush(stdout); - fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_INFO, safe_json_to_str(model_info).c_str()); - fflush(stdout); - common_log_resume(common_log_main()); +server_child_mode server_child::get_mode() { + const char * mode = std::getenv("LLAMA_SERVER_CHILD_MODE"); + std::string mode_str(mode ? mode : ""); + if (mode_str == "download") { + return SERVER_CHILD_MODE_DOWNLOAD; + } else { + return SERVER_CHILD_MODE_NORMAL; + } +} + +struct server_download_state : public common_download_callback { + server_child * self; + std::function should_stop; + std::atomic last_progress_time{0}; // multiple files downloading in different threads + bool is_ok = false; + + server_download_state(server_child * s) : self(s) {} + + bool run(common_params & params) { + try { + common_models_handler handler = common_models_handler_init(params, LLAMA_EXAMPLE_SERVER); + common_models_handler_apply(handler, params, this); + is_ok = true; + } catch (const std::exception & e) { + auto model_name = params.model.get_name(); + SRV_ERR("download failed for model name=%s: %s\n", model_name.c_str(), e.what()); + is_ok = false; + } + return is_ok; + } + void on_progress(const common_download_progress & p) { + json data = { + {"url", p.url}, + {"downloaded", p.downloaded}, + {"total", p.total}, + }; + self->notify_to_router(server_state_to_str(SERVER_STATE_DOWNLOADING), data); + } + void on_start(const common_download_progress & p) override { + on_progress(p); + } + void on_update(const common_download_progress & p) override { + int64_t now = ggml_time_ms(); + // throttle progress updates to avoid flooding logs + if (now - last_progress_time.load(std::memory_order_relaxed) >= 100) { + on_progress(p); + last_progress_time.store(now, std::memory_order_relaxed); + } + } + void on_done(const common_download_progress & p, bool) override { + on_progress(p); + } + bool is_cancelled() const override { + return should_stop ? should_stop() : false; + } +}; + +int server_child::run_download(common_params & params) { + auto cancelled = std::make_shared>(false); + + // monitor stdin for cancellation command from the router + std::thread signal_thread = setup([cancelled](int) { + cancelled->store(true, std::memory_order_relaxed); + }); + + server_download_state dl(this); + dl.should_stop = [cancelled]() { + return cancelled->load(std::memory_order_relaxed); + }; + bool ok = dl.run(params); + + notify_to_router(server_state_to_str(SERVER_STATE_DOWNLOADING), { + {"result", ok ? "download_finished" : "download_failed"}, + }); + + // router should send CMD_ROUTER_TO_CHILD_EXIT after receiving the result + if (signal_thread.joinable()) { + signal_thread.join(); + } + + SRV_INF("download completed %s\n", ok ? "successfully" : "with errors"); + return 0; +} + +std::thread server_child::setup(const std::function & shutdown_handler) { // setup thread for monitoring stdin return std::thread([shutdown_handler]() { // wait for EOF on stdin @@ -1098,10 +1489,15 @@ std::thread server_models::setup_child_server(const std::function & s }); } -void server_models::notify_router_sleeping_state(bool is_sleeping) { +void server_child::notify_to_router(const std::string & state, const json & payload) { + json data = { + {"state", state}, + {"payload", payload}, + }; + std::lock_guard lk(mtx_stdout); common_log_pause(common_log_main()); fflush(stdout); - fprintf(stdout, "%s\n", is_sleeping ? CMD_CHILD_TO_ROUTER_SLEEP : CMD_CHILD_TO_ROUTER_READY); + fprintf(stdout, "%s%s\n", CMD_CHILD_TO_ROUTER_STATE, safe_json_to_str(data).c_str()); fflush(stdout); common_log_resume(common_log_main()); } @@ -1111,6 +1507,42 @@ void server_models::notify_router_sleeping_state(bool is_sleeping) { // server_models_routes // +// RAII wrapper similar to server_response_reader, but doesn't use server_queue +static std::atomic sse_client_id_counter = 0; +struct server_models_sse_client { + server_response & queue_results; + int client_id; + server_models_sse_client(server_response & q) + : queue_results(q), client_id(sse_client_id_counter.fetch_add(1, std::memory_order_relaxed)) { + SRV_DBG("new SSE client connected, assigned client_id=%d\n", client_id); + queue_results.add_waiting_task_id(client_id); + } + ~server_models_sse_client() { + SRV_DBG("SSE client disconnected, removing client_id=%d\n", client_id); + queue_results.remove_waiting_task_id(client_id); + } + + // return nullptr if should_stop() is true before receiving a result + // note: if one error is received, it will stop further processing and return error result + server_task_result_ptr next(const std::function & should_stop) { + while (true) { + static const int http_polling_seconds = 1; // check should_stop every 1 second + server_task_result_ptr result = queue_results.recv_with_timeout({client_id}, http_polling_seconds); + if (result == nullptr) { + // timeout, check stop condition + if (should_stop()) { + return nullptr; + } + // continue waiting otherwise + } else { + SRV_DBG("recv result for client_id=%d: %s\n", client_id, safe_json_to_str(result->to_json()).c_str()); + return result; + } + } + // should not reach here + } +}; + static void res_ok(std::unique_ptr & res, const json & response_data) { res->status = 200; res->data = safe_json_to_str(response_data); @@ -1153,6 +1585,45 @@ static bool is_autoload(const common_params & params, const server_http_req & re } } +// percent encode one query or path component, covers reserved chars without pulling in +// httplib::detail. used by the stream routes to forward conversation_id to children safely +static std::string encode_qs(const std::string & in) { + std::string out; + out.reserve(in.size() * 3); + for (unsigned char c : in) { + bool safe = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') + || c == '-' || c == '_' || c == '.' || c == '~'; + if (safe) { + out.push_back(char(c)); + } else { + char buf[4]; + std::snprintf(buf, sizeof(buf), "%%%02X", c); + out.append(buf, 3); + } + } + return out; +} + +// resolve the child that owns a conversation's stream session via the conv_id -> model map +// populated when the POST was routed. single map lookup then a meta lookup, no polling, no +// parsing of the conv id. returns nullopt when nothing maps, the caller answers not found and +// the client recovers +static std::optional resolve_child_for_conv( + server_models & models, const std::string & conversation_id) { + if (conversation_id.empty()) { + return std::nullopt; + } + auto tracked = models.conv_models.lookup(conversation_id); + if (!tracked.has_value()) { + return std::nullopt; + } + auto meta = models.get_meta(*tracked); + if (meta.has_value() && meta->is_ready()) { + return meta; + } + return std::nullopt; +} + void server_models_routes::init_routes() { this->get_router_props = [this](const server_http_req & req) { std::string name = req.get_param("model"); @@ -1161,9 +1632,9 @@ void server_models_routes::init_routes() { auto res = std::make_unique(); res_ok(res, { // TODO: add support for this on web UI - {"role", "router"}, - {"max_instances", params.models_max}, - {"models_autoload", params.models_autoload}, + {"role", "router"}, + {"max_instances", params.models_max}, + {"models_autoload", params.models_autoload}, // this is a dummy response to make sure the UI doesn't break {"model_alias", "llama-server"}, {"model_path", "none"}, @@ -1172,11 +1643,9 @@ void server_models_routes::init_routes() { {"n_ctx", 0}, }}, // New key - {"ui_settings", ui_settings}, - // Deprecated: use ui_settings instead (kept for backward compat) - {"webui_settings", webui_settings}, - {"build_info", std::string(llama_build_info())}, - {"cors_proxy_enabled", params.ui_mcp_proxy || params.webui_mcp_proxy}, + {"ui_settings", ui_settings}, + {"build_info", std::string(llama_build_info())}, + {"cors_proxy_enabled", params.ui_mcp_proxy}, }); return res; } @@ -1203,6 +1672,12 @@ void server_models_routes::init_routes() { if (!router_validate_model(name, models, autoload, error_res)) { return error_res; } + // remember which child serves this conversation so the stream routes can route straight + // to it without polling, keyed on the exact conv id from the header + std::string conv_id = stream_conv_id_from_headers(req.headers); + if (!conv_id.empty()) { + models.conv_models.remember(conv_id, name); + } return models.proxy_request(req, method, name, true); // update last usage for POST request only }; @@ -1274,7 +1749,9 @@ void server_models_routes::init_routes() { {"created", t}, // for OAI-compat {"status", status}, {"architecture", architecture}, - {"need_download", meta.need_download}, + {"source", server_model_source_to_string(meta.source)}, + {"can_remove", meta.source == SERVER_MODEL_SOURCE_CACHE}, + // {"need_download", meta.need_download}, // TODO: add other fields, may require reading GGUF metadata }; @@ -1304,7 +1781,7 @@ void server_models_routes::init_routes() { res_err(res, format_error_response("model is not found", ERROR_TYPE_INVALID_REQUEST)); return res; } - if (!model->is_running()) { + if (!model->is_running() && model->status != SERVER_MODEL_STATUS_DOWNLOADING) { res_err(res, format_error_response("model is not running", ERROR_TYPE_INVALID_REQUEST)); return res; } @@ -1312,6 +1789,208 @@ void server_models_routes::init_routes() { res_ok(res, {{"success", true}}); return res; }; + + this->get_router_models_sse = [this](const server_http_req & req) { + auto res = std::make_unique(); + res->status = 200; + res->content_type = "text/event-stream"; + auto sse_client = std::make_shared(models.sse); + res->next = [this, sse_client, &req](std::string & output) -> bool { + auto result = sse_client->next([&]() { + return stopping.load(std::memory_order_relaxed) || req.should_stop(); + }); + if (result == nullptr) { + return false; // client disconnected or should_stop + } + output = "data: " + safe_json_to_str(result->to_json()) + "\n\n"; + return true; // listen for the next event + }; + return res; + }; + + this->post_router_models = [this](const server_http_req & req) { + auto res = std::make_unique(); + + json body = json::parse(req.body); + std::string name = json_value(body, "model", std::string()); + if (name.empty()) { + throw std::invalid_argument("model must be a non-empty string"); + } + + common_params p; + p.model.hf_repo = name; + p.hf_token = params.hf_token; + + // validate by fetching metadata + bool ok = false; + try { + common_models_handler_init(p, LLAMA_EXAMPLE_SERVER); + ok = true; + } catch (...) { + SRV_ERR("unknown error while validating model '%s'\n", name.c_str()); + // other exceptions will be handled by the outer ex_wrapper() + throw; + } + + if (!ok) { + throw std::invalid_argument("model validation failed, unable to download"); + } + + // reject if model already exists + if (models.has_model(name)) { + throw std::invalid_argument("model '" + name + "' already exists"); + } + + // then, proceed with the actual download + SRV_INF("starting download for model '%s'\n", name.c_str()); + { + server_models::load_options load_opts; + load_opts.mode = SERVER_CHILD_MODE_DOWNLOAD; + load_opts.custom_meta = server_model_meta{}; + load_opts.custom_meta->source = SERVER_MODEL_SOURCE_CACHE; + load_opts.custom_meta->name = name; + models.load(name, load_opts); + } + + res_ok(res, {{"success", true}}); + return res; + }; + + this->del_router_models = [this](const server_http_req & req) { + auto res = std::make_unique(); + + std::string name = req.get_param("model"); + if (name.empty()) { + throw std::invalid_argument("model must be a non-empty string"); + } + + models.remove(name); // throws on error + + res_ok(res, {{"success", true}}); + return res; + }; + + this->router_stream_get = [this](const server_http_req & req) { + // GET /v1/stream/?from=N. resolve the owning child from the conv_id -> model + // map, 404 when nothing maps + auto res = std::make_unique(); + std::string conv_id = req.get_param("conv_id"); + if (conv_id.empty()) { + res_err(res, format_error_response("Missing conversation id in path", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + std::optional owner = resolve_child_for_conv(models, conv_id); + if (!owner.has_value()) { + res_err(res, format_error_response("Stream not found or expired", ERROR_TYPE_NOT_FOUND)); + return res; + } + std::string from = req.get_param("from"); + std::string child_path = "/v1/stream/" + encode_qs(conv_id); + if (!from.empty()) { + child_path += "?from=" + from; + } + SRV_INF("proxying stream resume to model %s on port %d, path=%s\n", + owner->name.c_str(), owner->port, child_path.c_str()); + auto proxy = std::make_unique( + "GET", + "http", + CHILD_ADDR, + owner->port, + child_path, + req.headers, + req.body, + req.files, + req.should_stop, + params.timeout_read, + params.timeout_write); + return std::unique_ptr(std::move(proxy)); + }; + + this->router_streams_lookup = [this](const server_http_req & req) { + // POST /v1/streams/lookup. resolve each requested conv id to its owning child via the + // map, group the ids per child, and query only the children that actually own some of + // them instead of fanning out to every ready child. a child only answers for the ids + // it owns, never lists anything else + auto res = std::make_unique(); + std::vector requested; + try { + json body = json::parse(req.body); + if (body.contains("conversation_ids") && body["conversation_ids"].is_array()) { + for (const auto & v : body["conversation_ids"]) { + if (v.is_string() && !v.get().empty()) { + requested.push_back(v.get()); + } + } + } + } catch (const std::exception &) { + res_ok(res, json::array()); + return res; + } + + // group requested ids by the child port that owns them, drop ids that map to nothing + std::unordered_map per_child; + for (const auto & cid : requested) { + auto owner = resolve_child_for_conv(models, cid); + if (!owner.has_value()) { + continue; + } + per_child[owner->port].push_back(cid); + } + + json aggregated = json::array(); + for (auto & [port, ids] : per_child) { + json child_body = {{"conversation_ids", ids}}; + httplib::Client cli(CHILD_ADDR, port); + cli.set_connection_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000); + cli.set_read_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000); + cli.set_write_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000); + auto resp = cli.Post("/v1/streams/lookup", child_body.dump(), "application/json"); + if (!resp || resp->status != 200) { + continue; + } + try { + json child_arr = json::parse(resp->body); + if (!child_arr.is_array()) { + continue; + } + for (auto & entry : child_arr) { + if (entry.is_object()) { + aggregated.push_back(entry); + } + } + } catch (const std::exception &) { + continue; + } + } + res_ok(res, aggregated); + return res; + }; + + this->router_stream_delete = [this](const server_http_req & req) { + // DELETE /v1/stream/. resolve the owning child via the map and forward only to + // it, evict_and_cancel is idempotent on the child + auto res = std::make_unique(); + std::string conv_id = req.get_param("conv_id"); + if (conv_id.empty()) { + res_err(res, format_error_response("Missing conversation id in path", ERROR_TYPE_INVALID_REQUEST)); + return res; + } + std::string child_path = "/v1/stream/" + encode_qs(conv_id); + auto owner = resolve_child_for_conv(models, conv_id); + if (owner.has_value()) { + httplib::Client cli(CHILD_ADDR, owner->port); + cli.set_connection_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000); + cli.set_read_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000); + cli.set_write_timeout(0, STREAM_LOOKUP_TIMEOUT_MS * 1000); + auto resp = cli.Delete(child_path.c_str()); + (void) resp; // best effort, 404 and network errors are equivalent to no op + } + // drop the tracking entry, the session is being torn down + models.conv_models.forget(conv_id); + res->status = 204; + res->content_type = "application/json"; + return res; + }; } diff --git a/tools/server/server-models.h b/tools/server/server-models.h index 2198589a7aa2..62bed8725b5b 100644 --- a/tools/server/server-models.h +++ b/tools/server/server-models.h @@ -1,19 +1,26 @@ #pragma once #include "common.h" +#include "download.h" #include "preset.h" #include "server-common.h" #include "server-http.h" +#include "server-queue.h" #include #include #include #include +#include #include +#include +#include /** * state diagram: * + * DOWNLOADING ──► DOWNLOADED ──► (replaced by new instance) + * * UNLOADED ──► LOADING ──► LOADED ◄──── SLEEPING * ▲ │ │ ▲ * └───failed───┘ │ │ @@ -22,39 +29,48 @@ */ enum server_model_status { // TODO: also add downloading state when the logic is added + SERVER_MODEL_STATUS_DOWNLOADING, + SERVER_MODEL_STATUS_DOWNLOADED, SERVER_MODEL_STATUS_UNLOADED, SERVER_MODEL_STATUS_LOADING, SERVER_MODEL_STATUS_LOADED, SERVER_MODEL_STATUS_SLEEPING }; -static server_model_status server_model_status_from_string(const std::string & status_str) { - if (status_str == "unloaded") { - return SERVER_MODEL_STATUS_UNLOADED; - } - if (status_str == "loading") { - return SERVER_MODEL_STATUS_LOADING; - } - if (status_str == "loaded") { - return SERVER_MODEL_STATUS_LOADED; - } - if (status_str == "sleeping") { - return SERVER_MODEL_STATUS_SLEEPING; - } - throw std::runtime_error("invalid server model status"); -} +enum server_model_source { + SERVER_MODEL_SOURCE_PRESET, + SERVER_MODEL_SOURCE_MODELS_DIR, + SERVER_MODEL_SOURCE_CACHE, +}; + +enum server_child_mode { + SERVER_CHILD_MODE_NORMAL, // load the model and run normally + SERVER_CHILD_MODE_DOWNLOAD, // download the model and exit +}; static std::string server_model_status_to_string(server_model_status status) { switch (status) { - case SERVER_MODEL_STATUS_UNLOADED: return "unloaded"; - case SERVER_MODEL_STATUS_LOADING: return "loading"; - case SERVER_MODEL_STATUS_LOADED: return "loaded"; - case SERVER_MODEL_STATUS_SLEEPING: return "sleeping"; - default: return "unknown"; + case SERVER_MODEL_STATUS_DOWNLOADING: return "downloading"; + case SERVER_MODEL_STATUS_DOWNLOADED: return "downloaded"; + case SERVER_MODEL_STATUS_UNLOADED: return "unloaded"; + case SERVER_MODEL_STATUS_LOADING: return "loading"; + case SERVER_MODEL_STATUS_LOADED: return "loaded"; + case SERVER_MODEL_STATUS_SLEEPING: return "sleeping"; + default: return "unknown"; + } +} + +static std::string server_model_source_to_string(server_model_source source) { + switch (source) { + case SERVER_MODEL_SOURCE_PRESET: return "preset"; + case SERVER_MODEL_SOURCE_MODELS_DIR: return "models_dir"; + case SERVER_MODEL_SOURCE_CACHE: return "cache"; + default: return "unknown"; } } struct server_model_meta { + server_model_source source = SERVER_MODEL_SOURCE_CACHE; common_preset preset; std::string name; std::set aliases; // additional names that resolve to this model @@ -63,11 +79,12 @@ struct server_model_meta { server_model_status status = SERVER_MODEL_STATUS_UNLOADED; int64_t last_used = 0; // for LRU unloading std::vector args; // args passed to the model instance, will be populated by render_args() - json loaded_info; // info to be reflected via /v1/models endpoint + json loaded_info; // info to be reflected via /v1/models endpoint ; if in DOWNLOADING state, it should contain download progress info + json progress; // reflect load or download progress info, if any int exit_code = 0; // exit code of the model instance process (only valid if status == FAILED) int stop_timeout = 0; // seconds to wait before force-killing the model instance during shutdown mtmd_caps multimodal; // multimodal capabilities - bool need_download = false; // whether the model needs to be downloaded before loading + // bool need_download = false; // whether the model needs to be downloaded before loading // TODO @ngxson: implement this bool is_ready() const { return status == SERVER_MODEL_STATUS_LOADED; @@ -85,15 +102,17 @@ struct server_model_meta { void update_caps(); }; -struct subprocess_s; +struct server_models_routes; +struct server_subproc; // defined in server-models.cpp struct server_models { + friend struct server_models_routes; + private: struct instance_t { - std::shared_ptr subproc; // shared between main thread and monitoring thread + std::shared_ptr subproc; // shared between main thread and monitoring thread std::thread th; server_model_meta meta; - FILE * stdin_file = nullptr; }; std::mutex mutex; @@ -107,6 +126,47 @@ struct server_models { // set to true while load_models() is executing a reload; load() will wait until clear bool is_reloading = false; + // if true, the next get_meta() will trigger a reload of model list + bool need_reload = false; + + // conv_id -> model name that currently serves its stream session, lets the resumable stream + // routes go straight to the owning child instead of polling every one. populated when + // proxy_request forwards a POST carrying an X-Conversation-Id. best effort: a stale entry just + // makes the child answer not found and the client recovers. owns its lock, one mutex per struct + struct conv_model_tracker { + void remember(const std::string & conv_id, const std::string & model) { + if (conv_id.empty() || model.empty()) { + return; + } + std::lock_guard lock(mu); + map[conv_id] = model; + } + + std::optional lookup(const std::string & conv_id) { + if (conv_id.empty()) { + return std::nullopt; + } + std::lock_guard lock(mu); + auto it = map.find(conv_id); + if (it == map.end()) { + return std::nullopt; + } + return it->second; + } + + void forget(const std::string & conv_id) { + if (conv_id.empty()) { + return; + } + std::lock_guard lock(mu); + map.erase(conv_id); + } + + private: + std::mutex mu; + std::unordered_map map; + }; + common_preset_context ctx_preset; common_params base_params; @@ -122,9 +182,17 @@ struct server_models { // not thread-safe, caller must hold mutex void add_model(server_model_meta && meta); + // notify SSE clients + void notify_sse(const std::string & event, const std::string & model_id, const json & data = nullptr); + public: + // conv_id -> model tracker for the resumable stream routes, owns its lock + conv_model_tracker conv_models; + server_models(const common_params & params, int argc, char ** argv); + server_response sse; // for real-time updates via SSE endpoint + // (re-)load the list of models from various sources and prepare the metadata mapping // - if this is called the first time, simply populate the metadata // - if this is called subsequently (e.g. when refreshing from disk): @@ -141,19 +209,39 @@ struct server_models { // return a copy of all model metadata (thread-safe) std::vector get_all_meta(); + struct load_options { + server_child_mode mode = SERVER_CHILD_MODE_NORMAL; + // used for spawning a downloading child process + std::optional custom_meta = std::nullopt; + }; + // load and unload model instances // these functions are thread-safe void load(const std::string & name); + void load(const std::string & name, const load_options & opts); void unload(const std::string & name); void unload_all(); + struct update_status_args { + server_model_status status; + int exit_code = 0; // only valid if status == UNLOADED + json loaded_info = nullptr; + json progress = nullptr; + }; // update the status of a model instance (thread-safe) - void update_status(const std::string & name, server_model_status status, int exit_code); - void update_loaded_info(const std::string & name, std::string & raw_info); + // also send SSE notification to /models/sse endpoint + void update_status(const std::string & name, const update_status_args & args); + void update_download_progress(const std::string & name, const common_download_progress & progress, bool done, bool ok = true); + + // remove a cache model from disk and update the list (thread-safe) + // note: only cache models can be removed; returns false if the model doesn't exist or is not a cache model + bool remove(const std::string & name); // wait until the model instance is fully loaded (thread-safe) + // note: predicate is called while holding the lock // return when the model no longer in "loading" state - void wait_until_loading_finished(const std::string & name); + void wait(const std::string & name, std::function predicate); + void wait(std::unique_lock & lk, const std::string & name, std::function predicate); // ensure the model is in ready state (thread-safe) // return false if model is ready @@ -163,33 +251,47 @@ struct server_models { // proxy an HTTP request to the model instance server_http_res_ptr proxy_request(const server_http_req & req, const std::string & method, const std::string & name, bool update_last_used); + // handle message sent from server_child::notify_to_router() + // raw input must starts with CMD_CHILD_TO_ROUTER_STATE, followed by a JSON string + // this function is not thread-safe, must be called from instance's monitoring thread + // payload per state: + // state = loading -> payload = {} (TODO: add progress info) + // state = ready -> payload = model_info (json), or {} if wakeup from sleeping + // state = sleeping -> payload = {} + void handle_child_state(const std::string & name, const std::string & raw_input); +}; + +struct server_child { + // serializes the notify_to_router writes + std::mutex mtx_stdout; + std::atomic is_finished_downloading = false; // set by run_download + // return true if the current process is a child server instance - static bool is_child_server(); + bool is_child(); + server_child_mode get_mode(); + int run_download(common_params & params); - // notify the router server that a model instance is ready + // register the shutdown_handler to be called by the router // return the monitoring thread (to be joined by the caller) - static std::thread setup_child_server(const std::function & shutdown_handler, const json & model_info); + std::thread setup(const std::function & shutdown_handler); - // notify the router server that the sleeping state has changed - static void notify_router_sleeping_state(bool sleeping); + // notify router server for status changes (e.g. loading, downloading, sleeping, etc.) + // message will be handled by server_models::handle_child_state() on the router side + void notify_to_router(const std::string & state_name, const json & payload); }; struct server_models_routes { common_params params; - json ui_settings = json::object(); // Primary: new name - json webui_settings = json::object(); // Deprecated: use ui_settings (kept for compat) + json ui_settings = json::object(); // Primary: new name + std::atomic stopping = false; // for graceful disconnecting SSE clients during shutdown server_models models; server_models_routes(const common_params & params, int argc, char ** argv) : params(params), models(params, argc, argv) { - // Support both new ui_config_json and deprecated webui_config_json - const std::string & cfg = !this->params.ui_config_json.empty() - ? this->params.ui_config_json - : this->params.webui_config_json; + const std::string & cfg = this->params.ui_config_json; if (!cfg.empty()) { try { json json_settings = json::parse(cfg); ui_settings = json_settings; - webui_settings = json_settings; // Deprecated: keep in sync } catch (const std::exception & e) { LOG_ERR("%s: failed to parse UI config: %s\n", __func__, e.what()); throw; @@ -206,6 +308,16 @@ struct server_models_routes { server_http_context::handler_t get_router_models; server_http_context::handler_t post_router_models_load; server_http_context::handler_t post_router_models_unload; + // management API + server_http_context::handler_t get_router_models_sse; + server_http_context::handler_t post_router_models; + server_http_context::handler_t del_router_models; + + // router side handlers for the resumable streaming routes. each resolves the child that owns + // a conversation through the conv_id -> model map, no probing or fan out + server_http_context::handler_t router_stream_get; + server_http_context::handler_t router_streams_lookup; + server_http_context::handler_t router_stream_delete; }; /** diff --git a/tools/server/server-queue.cpp b/tools/server/server-queue.cpp index 32cfe7830c35..5d37c34536e3 100644 --- a/tools/server/server-queue.cpp +++ b/tools/server/server-queue.cpp @@ -331,6 +331,17 @@ void server_response::send(server_task_result_ptr && result) { } } +void server_response::broadcast(server_task_result_ptr && result) { + std::unique_lock lock(mutex_results); + for (const auto & id_task : waiting_task_ids) { + RES_DBG("task id = %d pushed to result queue\n", id_task); + server_task_result_ptr res_copy(result->clone()); + res_copy->id = id_task; // override id with target task id + queue_results.emplace_back(std::move(res_copy)); + } + condition_results.notify_all(); +} + void server_response::terminate() { running = false; condition_results.notify_all(); diff --git a/tools/server/server-queue.h b/tools/server/server-queue.h index 35f010401fcb..0b674d6ff0f9 100644 --- a/tools/server/server-queue.h +++ b/tools/server/server-queue.h @@ -154,11 +154,15 @@ struct server_response { // Send a new result to a waiting id_task void send(server_task_result_ptr && result); + // broadcast a new result to all waiting tasks + // (used by router mode) + void broadcast(server_task_result_ptr && result); + // terminate the waiting loop void terminate(); }; -// utility class to make working with server_queue and server_response easier +// RAII wrapper to make working with server_queue and server_response easier // it provides a generator-like API for server responses // support pooling connection state and aggregating multiple results struct server_response_reader { diff --git a/tools/server/server-schema.cpp b/tools/server/server-schema.cpp new file mode 100644 index 000000000000..ed4bda24122b --- /dev/null +++ b/tools/server/server-schema.cpp @@ -0,0 +1,638 @@ +#include "server-schema.h" + +#include "json-schema-to-grammar.h" + +namespace server_schema { + +// +// llama.cpp-specific completion schema +// + +std::vector> make_llama_cmpl_schema(const common_params & params_base, task_params & params) { + std::vector> fields; + auto add = [&](field * f) { + fields.emplace_back(f); + }; + + add((new field_bool("verbose", params.verbose)) + ->set_desc("Include __verbose field in the response with additional debug information")); + + add((new field_bool("timings_per_token", params.timings_per_token)) + ->set_desc("Include prompt processing and text generation speed information in each response")); + + add((new field_bool("stream", params.stream)) + ->set_desc("Allows receiving each predicted token in real-time instead of waiting for the completion to finish")); + + add((new field_nested("stream_options")) + ->add_subfield((new field_bool("include_usage", params.include_usage)) + ->set_desc("Whether to include usage information in the stream")) + ->set_desc("Additional options for streaming responses")); + + add((new field_bool("cache_prompt", params.cache_prompt)) + ->set_desc("Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests")); + + add((new field_bool("return_tokens", params.return_tokens)) + ->set_desc("Return the raw generated token ids in the `tokens` field")); + + add((new field_bool("return_progress", params.return_progress)) + ->set_desc("Include prompt processing progress events in stream mode")); + + add((new field_num("n_predict", params.n_predict)) + ->set_hard_limits(-1, INT32_MAX) + ->add_alias("max_completion_tokens") + ->add_alias("max_tokens") + ->set_desc("Set the maximum number of tokens to predict. When 0, no tokens will be generated but the prompt is evaluated into the cache")); + + add((new field_num("n_indent", params.n_indent)) + ->set_hard_limits(0, INT32_MAX) + ->set_desc("Specify the minimum line indentation for the generated text in number of whitespace characters. Useful for code completion tasks")); + + add((new field_num("n_keep", params.n_keep)) + ->set_hard_limits(-1, INT32_MAX) + ->set_desc("Specify the number of tokens from the initial prompt to retain when context size is exceeded. Use -1 to retain all tokens from the prompt")); + + add((new field_num("n_discard", params.n_discard)) + ->set_hard_limits(0, INT32_MAX) + ->set_desc("Number of tokens after n_keep that may be discarded when shifting context (0 = half context)")); + + add((new field_num("n_cmpl", params.n_cmpl)) + ->set_hard_limits(1, params_base.n_parallel) + ->add_alias("n") // alias "n" as fallback (OpenAI completions API) + ->set_desc("Number of completions to generate. If the input has multiple prompts, total outputs will be N prompts times n_cmpl")); + + add((new field_num("n_cache_reuse", params.n_cache_reuse)) + ->set_hard_limits(0, INT32_MAX) + ->set_desc("Min chunk size to attempt reusing from the cache via KV shifting. See --cache-reuse arg")); + + // TODO: implement t_max_prompt_ms + // add((new field_num("t_max_prompt_ms", params.t_max_prompt_ms)) + + add((new field_num("t_max_predict_ms", params.t_max_predict_ms)) + ->set_hard_limits(-1, std::numeric_limits::max()) + ->set_desc("Set a time limit in milliseconds for the prediction phase. The timeout triggers if generation exceeds this time (measured since the first token) and a newline has been generated. Useful for FIM applications")); + + add((new field_json("response_fields")) + ->set_desc("A list of response fields to return. Missing fields are omitted without error. Fields with a slash are unnested (e.g. generation_settings/n_predict moves n_predict to the root)") + ->set_handler([&](field_eval_context & ctx, const json & data) { + ctx.params.response_fields = json_value(data, "response_fields", std::vector()); + })); + + + // + // Sampling params + // + + add((new field_num("top_k", params.sampling.top_k)) + ->set_limits(0, INT32_MAX) + ->set_desc("Limit the next token selection to the K most probable tokens (0 = disabled)")); + + add((new field_num("top_p", params.sampling.top_p)) + ->set_limits(0.0f, 1.0f) + ->set_desc("Limit the next token selection to a subset of tokens with cumulative probability above threshold P (1.0 = disabled)")); + + add((new field_num("min_p", params.sampling.min_p)) + ->set_limits(0.0f, 1.0f) + ->set_desc("The minimum probability for a token to be considered, relative to the probability of the most likely token (0 = disabled)")); + + add((new field_num("top_n_sigma", params.sampling.top_n_sigma)) + ->set_desc("Keep tokens within n standard deviations of the top token logit (< 0 = disabled)")); + + add((new field_num("xtc_probability", params.sampling.xtc_probability)) + ->set_limits(0.0f, 1.0f) + ->set_desc("Set the chance for token removal via XTC sampler (0 = disabled)")); + + add((new field_num("xtc_threshold", params.sampling.xtc_threshold)) + ->set_limits(0.0f, 1.0f) + ->set_desc("Set a minimum probability threshold for tokens to be removed via XTC sampler (> 0.5 disables XTC)")); + + add((new field_num("typical_p", params.sampling.typ_p)) + // ->set_limits(0.0f, 1.0f) // what's the valid range? + ->set_desc("Enable locally typical sampling with parameter p (1.0 = disabled)")); + + add((new field_num("temperature", params.sampling.temp)) + ->set_limits(0.0f, std::numeric_limits::infinity()) + ->set_desc("Adjust the randomness of the generated text (0 = greedy)")); + + add((new field_num("dynatemp_range", params.sampling.dynatemp_range)) + ->set_desc("Dynamic temperature range. The final temperature will be in [temperature - range, temperature + range] (0 = disabled)")); + + add((new field_num("dynatemp_exponent", params.sampling.dynatemp_exponent)) + ->set_desc("Dynamic temperature exponent, controls how entropy maps to temperature")); + + add((new field_num("repeat_last_n", params.sampling.penalty_last_n)) + ->set_hard_limits(-1, INT32_MAX) + ->set_desc("Last n tokens to consider for penalizing repetition (0 = disabled, -1 = ctx-size)")); + + add((new field_num("repeat_penalty", params.sampling.penalty_repeat)) + ->set_desc("Control the repetition of token sequences in the generated text (1.0 = disabled)")); + + add((new field_num("frequency_penalty", params.sampling.penalty_freq)) + ->set_desc("Repeat alpha frequency penalty (0 = disabled)")); + + add((new field_num("presence_penalty", params.sampling.penalty_present)) + ->set_desc("Repeat alpha presence penalty (0 = disabled)")); + + add((new field_num("dry_multiplier", params.sampling.dry_multiplier)) + ->set_desc("Set the DRY (Don't Repeat Yourself) repetition penalty multiplier (0 = disabled)")); + + add((new field_num("dry_base", params.sampling.dry_base)) + ->set_desc("Set the DRY repetition penalty base value (must be >= 1.0, any values < 1.0 will be replaced with the default value)") + ->set_handler([&](field_eval_context & ctx, const json & data) { + float v = data.at("dry_base").get(); + ctx.params.sampling.dry_base = (v < 1.0f) ? params_base.sampling.dry_base : v; + })); + + add((new field_num("dry_allowed_length", params.sampling.dry_allowed_length)) + ->set_hard_limits(0, INT32_MAX) + ->set_desc("Tokens that extend repetition beyond this length receive exponentially increasing penalty: multiplier * base ^ (sequence_length - allowed_length)")); + + add((new field_num("dry_penalty_last_n", params.sampling.dry_penalty_last_n)) + ->set_hard_limits(-1, INT32_MAX) + ->set_desc("How many tokens to scan for repetitions (0 = disabled, -1 = context size)")); + + add((new field_num("mirostat", params.sampling.mirostat)) + ->set_limits(0, 2) + ->set_desc("Enable Mirostat sampling, controlling perplexity during text generation (0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)")); + + add((new field_num("mirostat_tau", params.sampling.mirostat_tau)) + ->set_desc("Set the Mirostat target entropy, parameter tau")); + + add((new field_num("mirostat_eta", params.sampling.mirostat_eta)) + ->set_desc("Set the Mirostat learning rate, parameter eta")); + + add((new field_num("adaptive_target", params.sampling.adaptive_target)) + ->set_limits(-std::numeric_limits::max(), 1.0f) + ->set_desc("Adaptive sampling target entropy (valid range 0.0 to 1.0; negative = disabled)")); + + add((new field_num("adaptive_decay", params.sampling.adaptive_decay)) + ->set_hard_limits(0.0f, 0.99f) + ->set_desc("EMA decay for adaptive sampling; history approximates 1/(1-decay) tokens")); + + // seed is uint32_t; field_num uses int32_t so use a handler + add((new field_num("seed", params.sampling.seed)) + ->set_desc("Set the random number generator (RNG) seed (-1 = random)")); + + add((new field_num("n_probs", params.sampling.n_probs)) + ->add_alias("logprobs") // use "logprobs" if "n_probs" wasn't provided + ->set_desc("If greater than 0, output the probabilities of top N tokens for each generated token")); + + add((new field_num("min_keep", params.sampling.min_keep)) + ->set_hard_limits(0, INT32_MAX) + ->set_desc("If greater than 0, force samplers to return at least N possible tokens")); + + add((new field_bool("backend_sampling", params.sampling.backend_sampling)) + ->set_desc("Use backend sampling instead of llama.cpp sampling")); + + add((new field_bool("post_sampling_probs", params.post_sampling_probs)) + ->set_desc("Return probabilities of top n_probs tokens after applying the sampling chain")); + + // + // Speculative decoding params + // + + // TODO: to keep things simple, we disable speculative parameter adjustments for now +#if 0 + // TODO: for now, be able to adjust only the draft-model based speculative parameters + add((new field_num("speculative.n_max", params.speculative.draft.n_max)) + ->set_hard_limits(0, INT32_MAX) + ->set_desc("Maximum number of tokens to draft during speculative decoding")); + + add((new field_num("speculative.n_min", params.speculative.draft.n_min)) + ->set_hard_limits(0, INT32_MAX) + ->set_desc("Minimum number of draft tokens to use for speculative decoding"); + + add((new field_num("speculative.p_min", params.speculative.draft.p_min)) + ->set_hard_limits(0.0f, 1.0f) + ->set_desc("Minimum speculative decoding probability for draft tokens (0 = greedy)")); + + add((new field_str("speculative.type")) + ->set_desc("Speculative decoding method (for debugging and research purposes)") + ->set_handler([&](field_eval_context & ctx, const json & data) { + ctx.params.speculative.types = { common_speculative_type_from_name(data.at("speculative.type").get()) }; + })); + + add((new field_num("speculative.ngram_size_n", params.speculative.ngram_simple.size_n)) + ->set_desc("Ngram size for lookup in ngram-based speculative decoding")); + + add((new field_num("speculative.ngram_size_m", params.speculative.ngram_simple.size_m)) + ->set_desc("Mgram size for speculative tokens in ngram-based speculative decoding")); + + add((new field_num("speculative.ngram_min_hits", params.speculative.ngram_simple.min_hits)) + ->set_desc("Minimum hits at ngram lookup for mgram to be proposed")); +#endif + + add((new field_json("lora")) + ->set_desc("A list of LoRA adapters to apply to this request. Each entry must have `id` and `scale` fields. Adapters not listed default to scale 0.0") + ->set_handler([&](field_eval_context & ctx, const json & data) { + const auto & lora = data.at("lora"); + if (!lora.is_array()) { + throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields"); + } + ctx.params.lora = parse_lora_request(lora); + })); + + // sequence breakers for DRY + // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format + // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39 + add((new field_json("dry_sequence_breakers")) + ->set_desc("Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted") + ->set_handler([&](field_eval_context & ctx, const json & data) { + ctx.params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector()); + if (ctx.params.sampling.dry_sequence_breakers.empty()) { + throw std::runtime_error("Error: dry_sequence_breakers must be a non-empty array of strings"); + } + })); + + // handle both "json_schema" and "grammar" + add((new field_json("json_schema")) + ->add_alias("grammar") + ->set_desc("Set a JSON schema (json_schema) or GBNF grammar string (grammar) for constrained generation. json_schema takes precedence if both are provided") + ->set_handler([&](field_eval_context & ctx, const json & data) { + auto & params = ctx.params; + if (data.contains("json_schema") && !data.contains("grammar")) { + try { + auto schema = json_value(data, "json_schema", json::object()); + SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str()); + std::string grammar_str = json_schema_to_grammar(schema); + SRV_DBG("Converted grammar: %s\n", grammar_str.c_str()); + params.sampling.grammar = {COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, std::move(grammar_str)}; + } catch (const std::exception & e) { + throw std::runtime_error(std::string("\"json_schema\": ") + e.what()); + } + } else { + std::string grammar_str = json_value(data, "grammar", std::string()); + if (!grammar_str.empty()) { + // grammar_type key is set by the server when converting chat template grammars + std::string grammar_type = json_value(data, "grammar_type", std::string()); + if (grammar_type == "tool_calls") { + params.sampling.grammar = {COMMON_GRAMMAR_TYPE_TOOL_CALLS, std::move(grammar_str)}; + } else { + // explicit grammar from the user (API field "grammar") + params.sampling.grammar = {COMMON_GRAMMAR_TYPE_USER, std::move(grammar_str)}; + } + SRV_DBG("Grammar (%s): %s\n", grammar_type.c_str(), common_grammar_value(params.sampling.grammar).c_str()); + } + } + })); + + add((new field_bool("grammar_lazy", params.sampling.grammar_lazy)) + ->set_desc("Whether to apply grammar constraints lazily, only when triggered (instead of at every step)")); + + // + // Chat parser params + // + + // TODO: change this to string field instead + add((new field_json("chat_format")) + ->set_desc("Chat format used internally by the server") + ->set_handler([&](field_eval_context & ctx, const json & data) { + ctx.params.chat_parser_params.format = static_cast(data.at("chat_format").get()); + SRV_INF("Chat format: %s\n", common_chat_format_name(ctx.params.chat_parser_params.format)); + })); + + add((new field_str("reasoning_format")) + ->set_desc("Reasoning format for chain-of-thought models") + ->set_handler([&](field_eval_context & ctx, const json & data) { + auto reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get()); + ctx.params.chat_parser_params.reasoning_format = reasoning_format; + ctx.params.chat_parser_params.reasoning_in_content = ctx.params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); + })); + + add((new field_str("generation_prompt")) + ->set_desc("Generation prompt appended to the chat template output") + ->set_handler([&](field_eval_context & ctx, const json & data) { + std::string s = data.at("generation_prompt").get(); + ctx.params.chat_parser_params.generation_prompt = s; + ctx.params.sampling.generation_prompt = s; + })); + + add((new field_bool("parse_tool_calls", params.chat_parser_params.parse_tool_calls)) + ->set_desc("Whether to parse tool calls from the generated output")); + + add((new field_str("chat_parser")) + ->set_desc("Chat parser configuration string") + ->set_handler([&](field_eval_context & ctx, const json & data) { + ctx.params.chat_parser_params.parser.load(data.at("chat_parser").get()); + })); + + add((new field_json("continue_final_message")) + ->set_desc("Whether to continue the final message of the chat template") + ->set_handler([&](field_eval_context & ctx, const json & data) { + auto continuation = common_chat_continuation_parse(data.at("continue_final_message")); + ctx.params.chat_parser_params.is_continuation = continuation != COMMON_CHAT_CONTINUATION_NONE; + })); + + add((new field_bool("echo", params.chat_parser_params.echo)) + ->set_desc("Whether to echo the input tokens in the output")); + + // + // Token-level fields (require vocab) + // + + add((new field_json("preserved_tokens")) + ->set_desc("List of token strings that must not be split during tokenization") + ->set_handler([&](field_eval_context & ctx, const json & data) { + GGML_ASSERT(ctx.vocab != nullptr); + for (const auto & t : data.at("preserved_tokens")) { + auto ids = common_tokenize(ctx.vocab, t.get(), false, true); + if (ids.size() == 1) { + ctx.params.sampling.preserved_tokens.insert(ids[0]); + } + } + })); + + add((new field_json("grammar_triggers")) + ->set_desc("List of strings or patterns that trigger grammar-constrained generation") + ->set_handler([&](field_eval_context & ctx, const json & data) { + GGML_ASSERT(ctx.vocab != nullptr); + for (const auto & t : data.at("grammar_triggers")) { + server_grammar_trigger ct(t); + if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) { + const auto & word = ct.value.value; + auto ids = common_tokenize(ctx.vocab, word, false, true); + if (ids.size() == 1) { + auto token = ids[0]; + if (std::find(ctx.params.sampling.preserved_tokens.begin(), ctx.params.sampling.preserved_tokens.end(), (llama_token) token) == ctx.params.sampling.preserved_tokens.end()) { + throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word); + } + common_grammar_trigger trigger; + trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN; + trigger.value = word; + trigger.token = token; + ctx.params.sampling.grammar_triggers.push_back(std::move(trigger)); + } else { + ctx.params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); + } + } else { + ctx.params.sampling.grammar_triggers.emplace_back(std::move(ct.value)); + } + } + if (ctx.params.sampling.grammar_lazy && ctx.params.sampling.grammar_triggers.empty()) { + throw std::runtime_error("Error: no triggers set for lazy grammar!"); + } + })); + + add((new field_bool("reasoning_control", params.sampling.reasoning_control)) + ->set_desc("Create the budget sampler on demand so reasoning can be ended at runtime")); + + add((new field_num("reasoning_budget_tokens", params.sampling.reasoning_budget_tokens)) + ->set_hard_limits(-1, INT32_MAX) + ->set_desc("Number of tokens in the reasoning budget (-1 = disabled)")); + + add((new field_str("reasoning_budget_start_tag")) + ->set_desc("Token string marking the start of the reasoning budget section") + ->set_handler([&](field_eval_context & ctx, const json & data) { + GGML_ASSERT(ctx.vocab != nullptr); + ctx.params.sampling.reasoning_budget_start = common_tokenize(ctx.vocab, data.at("reasoning_budget_start_tag").get(), false, true); + })); + + add((new field_str("reasoning_budget_end_tag")) + ->set_desc("Token string marking the end of the reasoning budget section") + ->set_handler([&](field_eval_context & ctx, const json & data) { + GGML_ASSERT(ctx.vocab != nullptr); + std::string end_tag = data.at("reasoning_budget_end_tag").get(); + ctx.params.sampling.reasoning_budget_end = common_tokenize(ctx.vocab, end_tag, false, true); + })); + + add((new field_str("reasoning_budget_message")) + ->set_desc("Message to prepend to the reasoning budget end tag when forcing it") + ->set_handler([&](field_eval_context & ctx, const json & data) { + GGML_ASSERT(ctx.vocab != nullptr); + std::string end_tag = json_value(data, "reasoning_budget_end_tag", std::string()); + std::string message = data.at("reasoning_budget_message").get(); + ctx.params.sampling.reasoning_budget_forced = common_tokenize(ctx.vocab, message + end_tag, false, true); + })); + + add((new field_json("logit_bias")) + ->set_desc("Modify the likelihood of specific tokens. Accepts an array of [token, bias] pairs or an object mapping token to bias. Use false as bias to ban a token") + ->set_handler([&](field_eval_context & ctx, const json & data) { + GGML_ASSERT(ctx.vocab != nullptr); + ctx.params.sampling.logit_bias.clear(); + const auto & logit_bias = data.at("logit_bias"); + const int n_vocab = llama_vocab_n_tokens(ctx.vocab); + auto parse_bias = [](const json & v, float & bias) -> bool { + if (v.is_number()) { bias = v.get(); return true; } + if (v.is_boolean() && !v.get()) { bias = -INFINITY; return true; } + return false; + }; + if (logit_bias.is_array()) { + for (const auto & el : logit_bias) { + if (!el.is_array() || el.size() != 2) continue; + float bias; + if (!parse_bias(el[1], bias)) continue; + if (el[0].is_number_integer()) { + llama_token tok = el[0].get(); + if (tok >= 0 && tok < n_vocab) ctx.params.sampling.logit_bias.push_back({tok, bias}); + } else if (el[0].is_string()) { + for (auto tok : common_tokenize(ctx.vocab, el[0].get(), false)) + ctx.params.sampling.logit_bias.push_back({tok, bias}); + } + } + } else if (logit_bias.is_object()) { + for (const auto & el : logit_bias.items()) { + float bias; + if (!parse_bias(el.value(), bias)) continue; + char * end; + llama_token tok = strtol(el.key().c_str(), &end, 10); + if (*end == 0) { + if (tok >= 0 && tok < n_vocab) ctx.params.sampling.logit_bias.push_back({tok, bias}); + } else { + for (auto t : common_tokenize(ctx.vocab, el.key(), false)) + ctx.params.sampling.logit_bias.push_back({t, bias}); + } + } + } + })); + + add((new field_bool("ignore_eos", params.sampling.ignore_eos)) + ->set_desc("Ignore the end-of-sequence token and continue generating") + ->set_handler([&](field_eval_context & ctx, const json & data) { + GGML_ASSERT(ctx.logit_bias_eog != nullptr); + ctx.params.sampling.ignore_eos = data.at("ignore_eos").get(); + if (ctx.params.sampling.ignore_eos && ctx.logit_bias_eog) { + ctx.params.sampling.logit_bias.insert( + ctx.params.sampling.logit_bias.end(), + ctx.logit_bias_eog->begin(), ctx.logit_bias_eog->end()); + } + })); + + add((new field_json("stop")) + ->set_desc("Specify stopping strings. Generation stops when one is produced, and the string is not included in the output") + ->set_handler([&](field_eval_context & ctx, const json & data) { + ctx.params.antiprompt.clear(); + const auto & stop = data.at("stop"); + if (stop.is_array()) { + for (const auto & word : stop) { + if (!word.empty()) ctx.params.antiprompt.push_back(word); + } + } else if (stop.is_string()) { + ctx.params.antiprompt.push_back(stop.get()); + } + // fall back to CLI defaults if the request provided no effective stop strings + if (ctx.params.antiprompt.empty()) { + ctx.params.antiprompt = params_base.antiprompt; + } + })); + + add((new field_json("samplers")) + ->set_desc("The order in which samplers are applied. An array of sampler type names, or a single string of sampler chars") + ->set_handler([&](field_eval_context & ctx, const json & data) { + const auto & samplers = data.at("samplers"); + if (samplers.is_array()) { + ctx.params.sampling.samplers = common_sampler_types_from_names(samplers); + } else if (samplers.is_string()) { + ctx.params.sampling.samplers = common_sampler_types_from_chars(samplers.get()); + } + })); + + return fields; +} + +task_params eval_llama_cmpl_schema( + const llama_vocab * vocab, + const common_params & params_base, + const int n_ctx_slot, + const std::vector & logit_bias_eog, + const json & data) { + task_params params; + + // Sampling parameter defaults are loaded from the global server context (but individual requests can still them) + params.sampling = params_base.sampling; + params.speculative = params_base.speculative; + params.n_keep = params_base.n_keep; + params.n_predict = params_base.n_predict; + params.n_cache_reuse = params_base.n_cache_reuse; + params.cache_prompt = params_base.cache_prompt; + params.antiprompt = params_base.antiprompt; + + // enabling this will output extra debug information in the HTTP responses from the server + params.verbose = params_base.verbosity > 9; + + params.chat_parser_params.reasoning_format = params_base.reasoning_format; + + // create context and schema + field_eval_context ctx(params); + ctx.vocab = vocab; + ctx.logit_bias_eog = &logit_bias_eog; + + auto schema = make_llama_cmpl_schema(params_base, params); + + // eval all fields in the schema + for (const auto & f : schema) { + f->eval(ctx, data); + } + + // post-processing + { + if (params.sampling.penalty_last_n == -1) { + // note: should be the slot's context and not the full context, but it's ok + params.sampling.penalty_last_n = n_ctx_slot; + } + + if (params.sampling.dry_penalty_last_n == -1) { + params.sampling.dry_penalty_last_n = n_ctx_slot; + } + + // if "reasoning_format" is not provided, its handler will not be called, we will need to handle it here + auto reasoning_format = params.chat_parser_params.reasoning_format; + params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); + } + + // debugging + { + auto budget = params.sampling.reasoning_budget_tokens; + SRV_DBG("reasoning budget: tokens=%d, generation_prompt='%s', start=%zu toks, end=%zu toks, forced=%zu toks\n", + budget, params.sampling.generation_prompt.c_str(), + params.sampling.reasoning_budget_start.size(), + params.sampling.reasoning_budget_end.size(), + params.sampling.reasoning_budget_forced.size()); + } + + return params; +} + +// +// eval() implementations +// + +static void handle_with_catch(const char * name, std::function func) { + try { + func(); + } catch (const std::exception & e) { + throw std::invalid_argument(string_format("Field '%s': %s", name, e.what())); + } +} + +template +void field_num::eval(field_eval_context & ctx, const json & data) { + for (const auto & n : name) { + if (data.contains(n)) { + handle_with_catch(n, [&]() { + if (custom_handler) { + custom_handler(ctx, data); + } else if (!is_hard_limit) { + val = std::max(min, std::min(max, data.at(n).template get())); + } else { + T tmp = data.at(n).template get(); + if (tmp < min || tmp > max) { + throw std::invalid_argument(std::string("Value must be between ") + std::to_string(min) + " <= value <= " + std::to_string(max) + ", but got " + std::to_string(tmp)); + } + val = tmp; + } + }); + return; + } + } +} + +void field_str::eval(field_eval_context & ctx, const json & data) { + GGML_ASSERT(custom_handler); + for (const auto & n : name) { + if (data.contains(n)) { + handle_with_catch(n, [&]() { + custom_handler(ctx, data); + }); + return; + } + } +} + +void field_bool::eval(field_eval_context & ctx, const json & data) { + for (const auto & n : name) { + if (data.contains(n)) { + handle_with_catch(n, [&]() { + if (custom_handler) { + custom_handler(ctx, data); + } else { + val = data.at(n).get(); + } + }); + return; + } + } +} + +void field_json::eval(field_eval_context & ctx, const json & data) { + GGML_ASSERT(custom_handler); + for (const auto & n : name) { + if (data.contains(n)) { + handle_with_catch(n, [&]() { + custom_handler(ctx, data); + }); + return; + } + } +} + +void field_nested::eval(field_eval_context & ctx, const json & data) { + for (const auto & n : name) { + if (data.contains(n) && data.at(n).is_object()) { + for (auto & f : subfields) { + f->eval(ctx, data.at(n)); + } + return; + } + } +} + +} // namespace server_schema diff --git a/tools/server/server-schema.h b/tools/server/server-schema.h new file mode 100644 index 000000000000..08cf427dc983 --- /dev/null +++ b/tools/server/server-schema.h @@ -0,0 +1,105 @@ +#pragma once + +#include "server-common.h" +#include "server-task.h" + +#include "sampling.h" +#include "speculative.h" + +#include +#include +#include +#include +#include +#include + +namespace server_schema { + +struct field_eval_context { + task_params & params; + const llama_vocab * vocab = nullptr; + const std::vector * logit_bias_eog = nullptr; + field_eval_context(task_params & params) : params(params) {} +}; + +using field_handler = std::function; + +struct field { + std::vector name; + const char * desc = ""; + field_handler custom_handler; + field() = default; + field(const char * n) : name({n}) {} + virtual ~field() = default; + field * set_desc(const char * s) { + desc = s; + return this; + } + // if 'name' is present, use it, otherwise look for aliases following the order they were added + field * add_alias(const char * n) { + name.push_back(n); + return this; + } + field * set_handler(field_handler h) { this->custom_handler = h; return this; } + virtual void eval(field_eval_context & ctx, const json & data) = 0; +}; + +template +struct field_num : public field { + T & val; + T min = std::numeric_limits::lowest(); + T max = std::numeric_limits::max(); + bool is_hard_limit = false; // if true, throw error if the value is invalid + field_num(const char * n, T & val) : field(n), val(val) {} + // limits are inclusive, min <= value <= max + field_num * set_limits(T min, T max) { + this->min = min; + this->max = max; + return this; + } + field_num * set_hard_limits(T min, T max) { + set_limits(min, max); + is_hard_limit = true; + return this; + } + virtual void eval(field_eval_context & ctx, const json & data) override; +}; + +struct field_str : public field { + field_str(const char * n) : field(n) {} + virtual void eval(field_eval_context & ctx, const json & data) override; +}; + +struct field_bool : public field { + bool & val; + field_bool(const char * n, bool & val) : field(n), val(val) {} + virtual void eval(field_eval_context & ctx, const json & data) override; +}; + +struct field_json : public field { + field_json(const char * n) : field(n) {} + virtual void eval(field_eval_context & ctx, const json & data) override; +}; + +struct field_nested : public field { + std::vector> subfields; + field_nested(const char * n) : field(n) {} + field_nested * add_subfield(field * f) { + subfields.emplace_back(std::unique_ptr(f)); + return this; + } + virtual void eval(field_eval_context & ctx, const json & data) override; +}; + +std::vector> make_llama_cmpl_schema( + const common_params & params_base, + task_params & params); + +task_params eval_llama_cmpl_schema( + const llama_vocab * vocab, + const common_params & params_base, + const int n_ctx_slot, + const std::vector & logit_bias_eog, + const json & data); + +} // namespace server_schema diff --git a/tools/server/server-stream.cpp b/tools/server/server-stream.cpp new file mode 100644 index 000000000000..757c36ad257a --- /dev/null +++ b/tools/server/server-stream.cpp @@ -0,0 +1,569 @@ +#include "server-stream.h" +#include "server-common.h" +#include "server-http.h" +#include "server-queue.h" + +#include +#include +#include + +namespace { +constexpr int64_t STREAM_SESSION_TTL_SECONDS = 300; +constexpr size_t STREAM_SESSION_MAX_BYTES = 4 * 1024 * 1024; +constexpr int64_t STREAM_SESSION_GC_INTERVAL_SECONDS = 60; +constexpr int64_t STREAM_READ_WAKE_INTERVAL_MS = 200; + +// returns unix time in seconds +int64_t now_seconds() { + return std::chrono::duration_cast( + std::chrono::system_clock::now().time_since_epoch() + ).count(); +} +} + +stream_session::stream_session(std::string conversation_id_, size_t max_bytes_) + : conversation_id(std::move(conversation_id_)) + , started_ts(now_seconds()) + , prefix_dropped(0) + , cap_bytes(max_bytes_) + , done(false) + , cancelled(false) + , completed_ts(0) { + buffer.reserve(64 * 1024); +} + +bool stream_session::append(const char * data, size_t len) { + if (len == 0) { + return true; + } + { + std::lock_guard lock(mu); + if (done.load(std::memory_order_relaxed)) { + return false; + } + if (len >= cap_bytes) { + // single chunk bigger than the cap, keep only the tail that fits + size_t skip = len - cap_bytes; + prefix_dropped += buffer.size() + skip; + buffer.clear(); + buffer.insert(buffer.end(), data + skip, data + len); + } else { + size_t needed = buffer.size() + len; + if (needed > cap_bytes) { + size_t to_drop = needed - cap_bytes; + buffer.erase(buffer.begin(), buffer.begin() + to_drop); + prefix_dropped += to_drop; + } + buffer.insert(buffer.end(), data, data + len); + } + } + cv.notify_all(); + return true; +} + +void stream_session::finalize() { + bool was_done = done.exchange(true, std::memory_order_acq_rel); + if (was_done) { + return; + } + completed_ts.store(now_seconds(), std::memory_order_release); + cv.notify_all(); +} + +stream_read_status stream_session::read_from(size_t offset, + const std::function & sink, + const std::function & should_stop) { + std::unique_lock lock(mu); + while (true) { + if (should_stop && should_stop()) { + return stream_read_status::OK; + } + if (offset < prefix_dropped) { + return stream_read_status::OFFSET_LOST; + } + size_t logical_end = prefix_dropped + buffer.size(); + if (offset < logical_end) { + size_t local_off = offset - prefix_dropped; + size_t n = buffer.size() - local_off; + // copy the available chunk under the lock, release before calling the sink + std::vector chunk(buffer.begin() + local_off, buffer.begin() + local_off + n); + offset += n; + lock.unlock(); + bool keep_going = sink(chunk.data(), chunk.size()); + if (!keep_going) { + return stream_read_status::OK; + } + lock.lock(); + continue; + } + if (done.load(std::memory_order_acquire)) { + return stream_read_status::OK; + } + // wait for new bytes, finalize, or a periodic wake to re check should_stop + cv.wait_for(lock, std::chrono::milliseconds(STREAM_READ_WAKE_INTERVAL_MS)); + } +} + +bool stream_session::is_done() const { + return done.load(std::memory_order_acquire); +} + +size_t stream_session::total_size() const { + std::lock_guard lock(mu); + return prefix_dropped + buffer.size(); +} + +size_t stream_session::dropped_prefix() const { + std::lock_guard lock(mu); + return prefix_dropped; +} + +int64_t stream_session::completed_at() const { + return completed_ts.load(std::memory_order_acquire); +} + +void stream_session::set_stop_producer(std::function fn) { + std::lock_guard lock(mu); + stop_producer = std::move(fn); +} + +void stream_session::cancel() { + // flip cancelled first so the producer-side stream_aware_should_stop can break out of the + // recv() wait even if remove_waiting_task_ids does not notify the condvar (the cancel task + // posted by rd.stop() will eventually notify, but we do not want to depend on that timing) + cancelled.store(true, std::memory_order_release); + // copy the hook under the lock then invoke outside, the producer side may grab queue locks + // and we do not want to hold our mu across that path + std::function fn; + { + std::lock_guard lock(mu); + fn = stop_producer; + } + if (fn) { + fn(); + } +} + +bool stream_session::is_cancelled() const { + return cancelled.load(std::memory_order_acquire); +} + +stream_session_manager::stream_session_manager() + : running(false) { +} + +stream_session_manager::~stream_session_manager() { + stop_gc(); +} + +stream_session_ptr stream_session_manager::create_or_replace(const std::string & conversation_id) { + // evict any previous session on the same conv, this guarantees the invariant + // "one conv = at most one live session" and propagates cancel to its producer + stream_session_ptr previous; + auto fresh = std::make_shared(conversation_id, STREAM_SESSION_MAX_BYTES); + { + std::unique_lock lock(map_mu); + auto it = sessions.find(conversation_id); + if (it != sessions.end()) { + previous = it->second; + it->second = fresh; + } else { + sessions.emplace(conversation_id, fresh); + } + } + if (previous) { + previous->cancel(); + previous->finalize(); + } + return fresh; +} + +stream_session_ptr stream_session_manager::get(const std::string & conversation_id) { + std::shared_lock lock(map_mu); + auto it = sessions.find(conversation_id); + if (it == sessions.end()) { + return nullptr; + } + return it->second; +} + +std::vector stream_session_manager::list_all() const { + std::vector out; + std::shared_lock lock(map_mu); + out.reserve(sessions.size()); + for (auto & kv : sessions) { + out.push_back(kv.second); + } + return out; +} + +void stream_session_manager::evict(const std::string & conversation_id) { + stream_session_ptr s; + { + std::unique_lock lock(map_mu); + auto it = sessions.find(conversation_id); + if (it == sessions.end()) { + return; + } + s = it->second; + sessions.erase(it); + } + // finalize outside the map lock so any pending readers wake up and exit + s->finalize(); +} + +void stream_session_manager::evict_and_cancel(const std::string & conversation_id) { + stream_session_ptr s; + { + std::unique_lock lock(map_mu); + auto it = sessions.find(conversation_id); + if (it == sessions.end()) { + return; + } + s = it->second; + sessions.erase(it); + } + // signal the producer side first so the inference is cancelled at the queue level, + // then finalize, which wakes any pending HTTP reader and lets the drain exit naturally + s->cancel(); + s->finalize(); +} + +void stream_session_manager::start_gc() { + if (running.exchange(true)) { + return; + } + gc_thread = std::thread([this] { gc_loop(); }); +} + +void stream_session_manager::stop_gc() { + bool was_running = running.exchange(false); + if (was_running) { + { + std::lock_guard lock(gc_wake_mu); + } + gc_wake_cv.notify_all(); + if (gc_thread.joinable()) { + gc_thread.join(); + } + } + // finalize all live sessions so no reader ever hangs + std::vector snapshot; + { + std::unique_lock lock(map_mu); + snapshot.reserve(sessions.size()); + for (auto & kv : sessions) { + snapshot.push_back(kv.second); + } + sessions.clear(); + } + for (auto & s : snapshot) { + s->finalize(); + } +} + +void stream_session_manager::gc_loop() { + while (running.load(std::memory_order_acquire)) { + { + std::unique_lock lock(gc_wake_mu); + gc_wake_cv.wait_for(lock, + std::chrono::seconds(STREAM_SESSION_GC_INTERVAL_SECONDS), + [this] { return !running.load(std::memory_order_acquire); }); + } + if (!running.load(std::memory_order_acquire)) { + return; + } + int64_t cutoff = now_seconds() - STREAM_SESSION_TTL_SECONDS; + std::vector to_drop; + { + std::unique_lock lock(map_mu); + for (auto it = sessions.begin(); it != sessions.end(); ) { + int64_t completed = it->second->completed_at(); + if (completed != 0 && completed <= cutoff) { + to_drop.push_back(it->second); + it = sessions.erase(it); + } else { + ++it; + } + } + } + // finalize outside the map lock, idempotent if the session was already done + for (auto & s : to_drop) { + s->finalize(); + } + } +} + +// process wide manager, lifecycle controlled by llama-server main() via start_gc/stop_gc +stream_session_manager g_stream_sessions; + +// stream_pipe --------------------------------------------------------------------------------- + +stream_pipe::stream_pipe(stream_session_ptr session) + : session_(std::move(session)) { +} + +bool stream_pipe::is_cancelled() const { + return session_->is_cancelled(); +} + +// stream_pipe_producer + +stream_pipe_producer::stream_pipe_producer(stream_session_ptr session) + : stream_pipe(std::move(session)) { +} + +stream_pipe_producer::~stream_pipe_producer() { + cleanup(); + session_->finalize(); +} + +void stream_pipe_producer::cleanup() { + if (!alive_) { + return; + } + alive_->store(false, std::memory_order_release); + session_->set_stop_producer(nullptr); + alive_.reset(); +} + +bool stream_pipe_producer::write(const char * data, size_t len) { + return session_->append(data, len); +} + +void stream_pipe_producer::done() { + done_ = true; +} + +void stream_pipe_producer::close() { + // httplib bails its content provider the moment is_peer_alive() goes false, so pump the rest + // of the generation into the ring buffer here. a DELETE flips is_cancelled and cuts it short + if (done_ || session_->is_cancelled()) { + SRV_INF("stream_pipe close: skip drain (done=%d cancelled=%d) conv=%s\n", + done_ ? 1 : 0, session_->is_cancelled() ? 1 : 0, session_->conversation_id.c_str()); + return; + } + SRV_INF("stream_pipe close: draining conv=%s\n", session_->conversation_id.c_str()); + size_t drained = 0; + std::string chunk; + while (true) { + chunk.clear(); + bool has_next = res_->next(chunk); + if (!chunk.empty()) { + write(chunk.data(), chunk.size()); + drained += chunk.size(); + } + if (!has_next) { + break; + } + } + SRV_INF("stream_pipe close: drain ended conv=%s bytes=%zu\n", session_->conversation_id.c_str(), drained); +} + +std::shared_ptr stream_pipe_producer::create(stream_session_ptr session, + server_http_res & res) { + auto alive = std::make_shared>(true); + auto * res_ptr = &res; + session->set_stop_producer([alive, res_ptr]() { + if (alive->load(std::memory_order_acquire)) { + res_ptr->stop(); + } + }); + auto pipe = std::shared_ptr(new stream_pipe_producer(std::move(session))); + pipe->alive_ = std::move(alive); + pipe->res_ = res_ptr; + return pipe; +} + +// stream_pipe_consumer + +stream_pipe_consumer::stream_pipe_consumer(stream_session_ptr session) + : stream_pipe(std::move(session)) { +} + +stream_read_status stream_pipe_consumer::read(size_t & offset, + const std::function & sink, + const std::function & should_stop) { + return session_->read_from(offset, sink, should_stop); +} + +std::shared_ptr stream_pipe_consumer::create(stream_session_ptr session) { + return std::shared_ptr(new stream_pipe_consumer(std::move(session))); +} + +// helper, builds the standard error response and assigns it to a brand new http_res +static server_http_res_ptr make_error_response(int status, const std::string & message, error_type type) { + auto res = std::make_unique(); + json err = format_error_response(message, type); + res->status = json_value(err, "code", status); + res->content_type = "application/json; charset=utf-8"; + res->data = safe_json_to_str({{"error", err}}); + return res; +} + +server_http_context::handler_t make_stream_get_handler() { + return [](const server_http_req & req) -> server_http_res_ptr { + // GET /v1/stream/?from=N replays the SSE bytes already buffered for the + // session, blocks for more bytes when the session is still running, returns when + // the session is finalized. the body is streamed back as text/event-stream so the + // browser EventSource can attach to it like a fresh request + std::string conv_id = req.get_param("conv_id"); + if (conv_id.empty()) { + return make_error_response(400, "Missing conversation id in path", ERROR_TYPE_INVALID_REQUEST); + } + auto session = g_stream_sessions.get(conv_id); + if (!session) { + return make_error_response(404, "Stream not found or expired", ERROR_TYPE_NOT_FOUND); + } + size_t from = 0; + std::string from_str = req.get_param("from"); + if (!from_str.empty()) { + try { + from = static_cast(std::stoull(from_str)); + } catch (const std::exception &) { + return make_error_response(400, "Invalid 'from' offset", ERROR_TYPE_INVALID_REQUEST); + } + } + if (from < session->dropped_prefix()) { + return make_error_response(400, "Stream offset lost, please restart", ERROR_TYPE_INVALID_REQUEST); + } + auto res = std::make_unique(); + res->status = 200; + res->content_type = "text/event-stream"; + // the next closure reads from the ring buffer at the requested offset, blocks until + // bytes arrive or the session finalizes. exit each call after draining the available + // chunk so set_chunked_content_provider gets a chance to flush to the socket + auto offset_ptr = std::make_shared(from); + // consumer pipe: read-only, does not finalize the session on destruction + auto pipe = stream_pipe_consumer::create(session); + res->next = [pipe, offset_ptr, &req](std::string & output) -> bool { + bool got_any = false; + pipe->read(*offset_ptr, + [&](const char * d, size_t n) { + output.append(d, n); + *offset_ptr += n; + got_any = true; + return false; + }, + req.should_stop); + return got_any; + }; + return res; + }; +} + +server_http_context::handler_t make_streams_lookup_handler() { + return [](const server_http_req & req) -> server_http_res_ptr { + // POST /v1/streams/lookup with body {"conversation_ids": ["X", "Y", ...]} returns the + // matching sessions, only for ids the caller already knows. each id matches the exact key + // and any "::" variant, so one lookup covers every per model session for a conv + std::vector requested; + try { + json body = json::parse(req.body); + if (body.contains("conversation_ids") && body["conversation_ids"].is_array()) { + for (const auto & v : body["conversation_ids"]) { + if (v.is_string()) { + std::string id = v.get(); + if (!id.empty()) { + requested.push_back(std::move(id)); + } + } + } + } + } catch (const std::exception & e) { + auto res = std::make_unique(); + res->status = 400; + res->content_type = "application/json; charset=utf-8"; + res->data = safe_json_to_str({{"error", {{"message", std::string("invalid body: ") + e.what()}, + {"type", "invalid_request_error"}}}}); + return res; + } + + std::vector sessions; + if (!requested.empty()) { + auto all = g_stream_sessions.list_all(); + for (const auto & rid : requested) { + const std::string with_sep = rid + "::"; + for (auto & s : all) { + if (s->conversation_id == rid || + s->conversation_id.compare(0, with_sep.size(), with_sep) == 0) { + sessions.push_back(s); + } + } + } + } + + json arr = json::array(); + for (auto & s : sessions) { + arr.push_back({ + {"conversation_id", s->conversation_id}, + {"is_done", s->is_done()}, + {"total_bytes", s->total_size()}, + {"started_at", s->started_ts}, + {"completed_at", s->completed_at()}, + }); + } + auto res = std::make_unique(); + res->status = 200; + res->content_type = "application/json; charset=utf-8"; + res->data = safe_json_to_str(arr); + return res; + }; +} + +server_http_context::handler_t make_stream_delete_handler() { + return [](const server_http_req & req) -> server_http_res_ptr { + // DELETE /v1/stream/ is the explicit user Stop, cancels the producer hook + // wired by handle_completions_impl and evicts the buffer. idempotent, a session that + // already finalized or was never created returns 204 either way + std::string conv_id = req.get_param("conv_id"); + if (conv_id.empty()) { + return make_error_response(400, "Missing conversation id in path", ERROR_TYPE_INVALID_REQUEST); + } + SRV_INF("DELETE /v1/stream/%s -> evict_and_cancel\n", conv_id.c_str()); + g_stream_sessions.evict_and_cancel(conv_id); + auto res = std::make_unique(); + res->status = 204; + res->content_type = "application/json"; + return res; + }; +} + +std::string stream_conv_id_from_headers(const std::map & headers) { + // case-insensitive scan for x-conversation-id + static constexpr char target[] = "x-conversation-id"; + static constexpr size_t target_len = sizeof(target) - 1; + for (const auto & [hk, hv] : headers) { + if (hk.size() != target_len) continue; + bool match = true; + for (size_t i = 0; i < target_len; ++i) { + char c = hk[i]; + if (c >= 'A' && c <= 'Z') c = char(c + 32); + if (c != target[i]) { match = false; break; } + } + if (match) { + return hv; + } + } + return std::string(); +} + +void stream_session_attach_pipe(server_http_res & res, const std::map & headers) { + std::string conversation_id = stream_conv_id_from_headers(headers); + SRV_INF("stream_session_attach_pipe: conv_id=%s (empty=%d)\n", + conversation_id.c_str(), conversation_id.empty() ? 1 : 0); + if (conversation_id.empty()) { + return; + } + auto session = g_stream_sessions.create_or_replace(conversation_id); + res.spipe = stream_pipe_producer::create(session, res); +} + +std::function stream_aware_should_stop(server_http_res * res, std::function fallback) { + return [res, fallback = std::move(fallback)]() -> bool { + if (res->spipe) { + return res->spipe->is_cancelled(); + } + return fallback(); + }; +} diff --git a/tools/server/server-stream.h b/tools/server/server-stream.h new file mode 100644 index 000000000000..ff363bb4cd7f --- /dev/null +++ b/tools/server/server-stream.h @@ -0,0 +1,203 @@ +#pragma once + +#include "server-http.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum class stream_read_status { + OK, + OFFSET_LOST, +}; + +// streaming buffer for one generation, survives HTTP disconnect. the producer appends raw SSE +// bytes, readers drain from any offset via read_from and block until more bytes or finalize. +// keyed by conversation_id: one conv = at most one live session +struct stream_session { + std::string conversation_id; + int64_t started_ts; // unix seconds at construction, used by /v1/streams listing + + stream_session(std::string conversation_id_, size_t max_bytes_); + stream_session(const stream_session &) = delete; + stream_session & operator=(const stream_session &) = delete; + + // append raw bytes, drops from the front if the cap is reached. + // returns false if the session is already finalized + bool append(const char * data, size_t len); + + // mark the session as complete, wakes all pending readers + void finalize(); + + // drain bytes from offset, calling sink for each chunk. blocks until more + // bytes arrive or finalize is called. returns OK on clean exit, OFFSET_LOST + // if offset falls below the dropped prefix + stream_read_status read_from(size_t offset, + const std::function & sink, + const std::function & should_stop); + + bool is_done() const; + bool is_cancelled() const; + size_t total_size() const; // bytes that ever entered the session + size_t dropped_prefix() const; // bytes evicted from the front due to cap + int64_t completed_at() const; // 0 while alive, unix seconds after finalize + + // attach the producer stop hook used to cancel its reader, pass an empty function to detach + void set_stop_producer(std::function fn); + + // signal the producer to abort its inference asap via the stop hook, idempotent + void cancel(); + +private: + mutable std::mutex mu; + std::condition_variable cv; + std::vector buffer; + size_t prefix_dropped; + size_t cap_bytes; + std::atomic done; + std::atomic cancelled; + std::atomic completed_ts; + std::function stop_producer; // protected by mu +}; + +using stream_session_ptr = std::shared_ptr; + +// one end of a stream_session pipe. the base holds the session and the shared query, the +// producer and consumer ends derive from it. virtual dtor so each end runs its own teardown: +// the producer finalizes the session, the consumer leaves it untouched +struct stream_pipe { + virtual ~stream_pipe() = default; + + // true if the session was cancelled (e.g. via DELETE /v1/stream/) + bool is_cancelled() const; + +protected: + explicit stream_pipe(stream_session_ptr session); + + stream_session_ptr session_; +}; + +// producer end: writes chunks into the ring buffer and owns the session lifetime, finalizing it +// on destruction. +// +// lifetime safety: holds a shared_ptr> alive also captured by the session's +// stop_producer hook. cleanup() sets alive=false and clears the hook; it must run while the +// response the hook calls stop() on is still alive. ~server_res_generator() does this explicitly. +struct stream_pipe_producer : stream_pipe { + ~stream_pipe_producer() override; + + // append raw bytes to the session's ring buffer, returns false if already finalized + bool write(const char * data, size_t len); + + // mark the natural end on the wire so a later close() is a no-op + void done(); + + // on a peer drop, pump the response next() into the ring buffer until done. runs on the http + // worker from on_complete, no-op after done() or cancel + void close(); + + // disarm the stop hook and drop the alive guard, must run while the response the hook + // references is still alive. idempotent, the destructor calls it too + void cleanup(); + + // res.stop() is invoked when the session is cancelled, the alive guard ensures stop() is not + // called after cleanup() has run + static std::shared_ptr create(stream_session_ptr session, server_http_res & res); + +private: + explicit stream_pipe_producer(stream_session_ptr session); + + bool done_ = false; + std::shared_ptr> alive_; + server_http_res * res_ = nullptr; +}; + +// consumer end: read-only replay of the ring buffer, the destructor does not finalize the session +struct stream_pipe_consumer : stream_pipe { + // drain bytes from offset, calling sink for each available chunk. blocks until more data + // arrives or the session finalizes. should_stop is polled, returns OFFSET_LOST if offset + // fell below the dropped prefix + stream_read_status read(size_t & offset, + const std::function & sink, + const std::function & should_stop); + + static std::shared_ptr create(stream_session_ptr session); + +private: + explicit stream_pipe_consumer(stream_session_ptr session); +}; + +// owns all live sessions, runs a periodic GC to evict expired ones. +// the map is keyed by conversation_id, so the invariant "one conv = at most one +// live session" is enforced at the type level +class stream_session_manager { +public: + stream_session_manager(); + ~stream_session_manager(); + + stream_session_manager(const stream_session_manager &) = delete; + stream_session_manager & operator=(const stream_session_manager &) = delete; + + // install a new session for this conversation, evicting and cancelling any previous one. + // the conversation_id must be non empty, the caller is responsible for that check. + // returns the new session + stream_session_ptr create_or_replace(const std::string & conversation_id); + + // lookup, returns null if unknown or already evicted + stream_session_ptr get(const std::string & conversation_id); + + // list every live or recently completed session, used by GET /v1/streams without filter + std::vector list_all() const; + + // remove from the map and finalize, wakes any pending readers + void evict(const std::string & conversation_id); + + // signal the producer to cancel asap then evict, used by the explicit user Stop path + void evict_and_cancel(const std::string & conversation_id); + + void start_gc(); + void stop_gc(); + +private: + void gc_loop(); + + mutable std::shared_mutex map_mu; + std::unordered_map sessions; // key: conversation_id + std::thread gc_thread; + std::atomic running; + std::mutex gc_wake_mu; + std::condition_variable gc_wake_cv; +}; + +// process wide manager, linked by both llama-server and llama-cli. llama-server main() drives +// start_gc/stop_gc, llama-cli leaves it idle. the dtor calls stop_gc() unconditionally so exit +// is safe whether or not the GC thread ran +extern stream_session_manager g_stream_sessions; + +// route handler factories operating on g_stream_sessions, wired under /v1/stream/* by server.cpp. +// keeps the resumable stream surface confined to server-stream +server_http_context::handler_t make_stream_get_handler(); +server_http_context::handler_t make_streams_lookup_handler(); +server_http_context::handler_t make_stream_delete_handler(); + +// extract the X-Conversation-Id header value (case-insensitive), empty when absent. exposed so +// the router can track which child serves a forwarded POST +std::string stream_conv_id_from_headers(const std::map & headers); + +// on an X-Conversation-Id header, create or replace the session and attach a producer pipe to +// res. no-op when absent, called from the server_res_generator constructor +void stream_session_attach_pipe(server_http_res & res, const std::map & headers); + +// should_stop closure that ignores peer disconnect when a pipe is attached, so only an explicit +// DELETE stops the producer and generation keeps flowing into the ring buffer. without a pipe it +// delegates to fallback, the legacy non-resumable flow +std::function stream_aware_should_stop(server_http_res * res, std::function fallback); diff --git a/tools/server/server-task.cpp b/tools/server/server-task.cpp index 72a4bd076ad7..a9ebac013f81 100644 --- a/tools/server/server-task.cpp +++ b/tools/server/server-task.cpp @@ -233,395 +233,7 @@ common_chat_msg task_result_state::update_chat_msg( } // -// server_task -// - -task_params server_task::params_from_json_cmpl( - const llama_vocab * vocab, - const common_params & params_base, - const int n_ctx_slot, - const std::vector & logit_bias_eog, - const json & data) { - task_params params; - - // Sampling parameter defaults are loaded from the global server context (but individual requests can still them) - task_params defaults; - defaults.sampling = params_base.sampling; - defaults.speculative = params_base.speculative; - defaults.n_keep = params_base.n_keep; - defaults.n_predict = params_base.n_predict; - defaults.n_cache_reuse = params_base.n_cache_reuse; - defaults.cache_prompt = params_base.cache_prompt; - defaults.antiprompt = params_base.antiprompt; - - // enabling this will output extra debug information in the HTTP responses from the server - params.verbose = params_base.verbosity > 9; - params.timings_per_token = json_value(data, "timings_per_token", false); - - params.stream = json_value(data, "stream", false); - auto stream_opt = json_value(data, "stream_options", json::object()); - params.include_usage = json_value(stream_opt, "include_usage", false); - params.cache_prompt = json_value(data, "cache_prompt", defaults.cache_prompt); - params.return_tokens = json_value(data, "return_tokens", false); - params.return_progress = json_value(data, "return_progress", false); - auto max_tokens = json_value(data, "max_tokens", defaults.n_predict); - params.n_predict = json_value(data, "n_predict", json_value(data, "max_completion_tokens", max_tokens)); - params.n_indent = json_value(data, "n_indent", defaults.n_indent); - params.n_keep = json_value(data, "n_keep", defaults.n_keep); - params.n_discard = json_value(data, "n_discard", defaults.n_discard); - params.n_discard = std::max(0, params.n_discard); - params.n_cmpl = json_value(data, "n_cmpl", json_value(data, "n", 1)); - params.n_cache_reuse = json_value(data, "n_cache_reuse", defaults.n_cache_reuse); - //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement - params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); - params.response_fields = json_value(data, "response_fields", std::vector()); - - params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); - params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); - params.sampling.min_p = json_value(data, "min_p", defaults.sampling.min_p); - params.sampling.top_n_sigma = json_value(data, "top_n_sigma", defaults.sampling.top_n_sigma); - params.sampling.xtc_probability = json_value(data, "xtc_probability", defaults.sampling.xtc_probability); - params.sampling.xtc_threshold = json_value(data, "xtc_threshold", defaults.sampling.xtc_threshold); - params.sampling.typ_p = json_value(data, "typical_p", defaults.sampling.typ_p); - params.sampling.temp = json_value(data, "temperature", defaults.sampling.temp); - params.sampling.dynatemp_range = json_value(data, "dynatemp_range", defaults.sampling.dynatemp_range); - params.sampling.dynatemp_exponent = json_value(data, "dynatemp_exponent", defaults.sampling.dynatemp_exponent); - params.sampling.penalty_last_n = json_value(data, "repeat_last_n", defaults.sampling.penalty_last_n); - params.sampling.penalty_repeat = json_value(data, "repeat_penalty", defaults.sampling.penalty_repeat); - params.sampling.penalty_freq = json_value(data, "frequency_penalty", defaults.sampling.penalty_freq); - params.sampling.penalty_present = json_value(data, "presence_penalty", defaults.sampling.penalty_present); - params.sampling.dry_multiplier = json_value(data, "dry_multiplier", defaults.sampling.dry_multiplier); - params.sampling.dry_base = json_value(data, "dry_base", defaults.sampling.dry_base); - params.sampling.dry_allowed_length = json_value(data, "dry_allowed_length", defaults.sampling.dry_allowed_length); - params.sampling.dry_penalty_last_n = json_value(data, "dry_penalty_last_n", defaults.sampling.dry_penalty_last_n); - params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); - params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); - params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); - params.sampling.adaptive_target = json_value(data, "adaptive_target", defaults.sampling.adaptive_target); - params.sampling.adaptive_decay = json_value(data, "adaptive_decay", defaults.sampling.adaptive_decay); - params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); - params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); - params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); - params.sampling.backend_sampling = json_value(data, "backend_sampling", defaults.sampling.backend_sampling); - params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); - - params.speculative = defaults.speculative; - - // TODO: to keep things simple, we disable speculative parameter adjustments for now -#if 0 - // TODO: for now, be able to adjust only the draft-model based speculative parameters - params.speculative.draft.n_min = json_value(data, "speculative.n_min", defaults.speculative.draft.n_min); - params.speculative.draft.n_max = json_value(data, "speculative.n_max", defaults.speculative.draft.n_max); - params.speculative.draft.p_min = json_value(data, "speculative.p_min", defaults.speculative.draft.p_min); - - params.speculative.draft.n_min = std::min(params.speculative.draft.n_max, params.speculative.draft.n_min); - params.speculative.draft.n_min = std::max(params.speculative.draft.n_min, 0); - params.speculative.draft.n_max = std::max(params.speculative.draft.n_max, 0); - - // for debugging and research purposes - params.speculative.type = common_speculative_type_from_name(json_value(data, "speculative.type", common_speculative_type_to_str(defaults.speculative.type))); - - params.speculative.ngram_size_n = json_value(data, "speculative.ngram_size_n", defaults.speculative.ngram_size_n); - params.speculative.ngram_size_m = json_value(data, "speculative.ngram_size_m", defaults.speculative.ngram_size_m); - params.speculative.ngram_min_hits = json_value(data, "speculative.ngram_m_hits", defaults.speculative.ngram_min_hits); - - params.speculative.ngram_size_n = std::max(std::min(1, (int) params.speculative.ngram_size_n), 1024); - params.speculative.ngram_size_m = std::max(std::min(1, (int) params.speculative.ngram_size_m), 1024); - params.speculative.ngram_min_hits = std::max(std::min(1, (int) params.speculative.ngram_min_hits), 1024); -#endif - - // Use OpenAI API logprobs only if n_probs wasn't provided - if (data.contains("logprobs") && params.sampling.n_probs == defaults.sampling.n_probs){ - params.sampling.n_probs = json_value(data, "logprobs", defaults.sampling.n_probs); - } - - if (data.contains("lora")) { - if (data.at("lora").is_array()) { - params.lora = parse_lora_request(data.at("lora")); - } else { - throw std::runtime_error("Error: 'lora' must be an array of objects with 'id' and 'scale' fields"); - } - } else { - params.lora = {}; - } - - // TODO: add more sanity checks for the input parameters - - if (params.sampling.penalty_last_n < -1) { - throw std::runtime_error("Error: repeat_last_n must be >= -1"); - } - - if (params.sampling.dry_penalty_last_n < -1) { - throw std::runtime_error("Error: dry_penalty_last_n must be >= -1"); - } - - if (params.sampling.penalty_last_n == -1) { - // note: should be the slot's context and not the full context, but it's ok - params.sampling.penalty_last_n = n_ctx_slot; - } - - if (params.sampling.dry_penalty_last_n == -1) { - params.sampling.dry_penalty_last_n = n_ctx_slot; - } - - if (params.sampling.dry_base < 1.0f) { - params.sampling.dry_base = defaults.sampling.dry_base; - } - - // sequence breakers for DRY - { - // Currently, this is not compatible with TextGen WebUI, Koboldcpp and SillyTavern format - // Ref: https://github.com/oobabooga/text-generation-webui/blob/d1af7a41ade7bd3c3a463bfa640725edb818ebaf/extensions/openai/typing.py#L39 - - if (data.contains("dry_sequence_breakers")) { - params.sampling.dry_sequence_breakers = json_value(data, "dry_sequence_breakers", std::vector()); - if (params.sampling.dry_sequence_breakers.empty()) { - throw std::runtime_error("Error: dry_sequence_breakers must be a non-empty array of strings"); - } - } - } - - // process "json_schema" and "grammar" - if (data.contains("json_schema") && !data.contains("grammar")) { - try { - auto schema = json_value(data, "json_schema", json::object()); - SRV_DBG("JSON schema: %s\n", schema.dump(2).c_str()); - std::string grammar_str = json_schema_to_grammar(schema); - SRV_DBG("Converted grammar: %s\n", grammar_str.c_str()); - params.sampling.grammar = {COMMON_GRAMMAR_TYPE_OUTPUT_FORMAT, std::move(grammar_str)}; - } catch (const std::exception & e) { - throw std::runtime_error(std::string("\"json_schema\": ") + e.what()); - } - } else { - params.sampling.grammar = defaults.sampling.grammar; - - std::string grammar_str = json_value(data, "grammar", std::string()); - if (!grammar_str.empty()) { - // grammar_type key is set by the server when converting chat template grammars - std::string grammar_type = json_value(data, "grammar_type", std::string()); - if (grammar_type == "tool_calls") { - params.sampling.grammar = {COMMON_GRAMMAR_TYPE_TOOL_CALLS, std::move(grammar_str)}; - } else { - // explicit grammar from the user (API field "grammar") - params.sampling.grammar = {COMMON_GRAMMAR_TYPE_USER, std::move(grammar_str)}; - } - SRV_DBG("Grammar (%s): %s\n", grammar_type.c_str(), common_grammar_value(params.sampling.grammar).c_str()); - } - params.sampling.grammar_lazy = json_value(data, "grammar_lazy", defaults.sampling.grammar_lazy); - SRV_DBG("Grammar lazy: %s\n", params.sampling.grammar_lazy ? "true" : "false"); - } - - { - auto it = data.find("chat_format"); - if (it != data.end()) { - params.chat_parser_params.format = static_cast(it->get()); - SRV_INF("Chat format: %s\n", common_chat_format_name(params.chat_parser_params.format)); - } else { - params.chat_parser_params.format = defaults.chat_parser_params.format; - } - common_reasoning_format reasoning_format = params_base.reasoning_format; - if (data.contains("reasoning_format")) { - reasoning_format = common_reasoning_format_from_name(data.at("reasoning_format").get()); - } - params.chat_parser_params.reasoning_format = reasoning_format; - params.chat_parser_params.reasoning_in_content = params.stream && (reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY); - params.chat_parser_params.generation_prompt = json_value(data, "generation_prompt", std::string()); - params.sampling.generation_prompt = params.chat_parser_params.generation_prompt; - SRV_DBG("Generation prompt: '%s'\n", params.chat_parser_params.generation_prompt.c_str()); - params.chat_parser_params.parse_tool_calls = json_value(data, "parse_tool_calls", false); - if (data.contains("chat_parser")) { - params.chat_parser_params.parser.load(data.at("chat_parser").get()); - } - if (data.contains("continue_final_message")) { - auto continuation = common_chat_continuation_parse(data.at("continue_final_message")); - params.chat_parser_params.is_continuation = continuation != COMMON_CHAT_CONTINUATION_NONE; - } - params.chat_parser_params.echo = json_value(data, "echo", false); - } - - { - const auto preserved_tokens = data.find("preserved_tokens"); - if (preserved_tokens != data.end()) { - for (const auto & t : *preserved_tokens) { - auto ids = common_tokenize(vocab, t.get(), /* add_special= */ false, /* parse_special= */ true); - if (ids.size() == 1) { - SRV_DBG("Preserved token: %d\n", ids[0]); - params.sampling.preserved_tokens.insert(ids[0]); - } else { - // This may happen when using a tool call style meant for a model with special tokens to preserve on a model without said tokens. - SRV_DBG("Not preserved because more than 1 token: %s\n", t.get().c_str()); - } - } - } - const auto grammar_triggers = data.find("grammar_triggers"); - if (grammar_triggers != data.end()) { - for (const auto & t : *grammar_triggers) { - server_grammar_trigger ct(t); - if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) { - const auto & word = ct.value.value; - auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true); - if (ids.size() == 1) { - auto token = ids[0]; - if (std::find(params.sampling.preserved_tokens.begin(), params.sampling.preserved_tokens.end(), (llama_token) token) == params.sampling.preserved_tokens.end()) { - throw std::runtime_error("Grammar trigger word should be marked as preserved token: " + word); - } - SRV_DBG("Grammar trigger token: %d (`%s`)\n", token, word.c_str()); - common_grammar_trigger trigger; - trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN; - trigger.value = word; - trigger.token = token; - params.sampling.grammar_triggers.push_back(std::move(trigger)); - } else { - SRV_DBG("Grammar trigger word: `%s`\n", word.c_str()); - params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); - } - } else { - if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN) { - SRV_DBG("Grammar trigger pattern: `%s`\n", ct.value.value.c_str()); - } else if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL) { - SRV_DBG("Grammar trigger pattern full: `%s`\n", ct.value.value.c_str()); - } else { - throw std::runtime_error("Unknown grammar trigger type"); - } - params.sampling.grammar_triggers.emplace_back(std::move(ct.value)); - } - } - } - if (params.sampling.grammar_lazy && params.sampling.grammar_triggers.empty()) { - throw std::runtime_error("Error: no triggers set for lazy grammar!"); - } - } - - // Parse reasoning budget sampler parameters - { - const int32_t budget = json_value(data, "reasoning_budget_tokens", (int32_t) -1); - const auto start_tag = json_value(data, "reasoning_budget_start_tag", std::string()); - const auto end_tag = json_value(data, "reasoning_budget_end_tag", std::string()); - const auto message = json_value(data, "reasoning_budget_message", std::string()); - params.sampling.reasoning_budget_tokens = budget; - params.sampling.reasoning_control = json_value(data, "reasoning_control", false); - - if (!start_tag.empty()) { - params.sampling.reasoning_budget_start = common_tokenize(vocab, start_tag, false, true); - } - if (!end_tag.empty()) { - params.sampling.reasoning_budget_end = common_tokenize(vocab, end_tag, false, true); - params.sampling.reasoning_budget_forced = common_tokenize(vocab, message + end_tag, false, true); - - SRV_DBG("reasoning budget: tokens=%d, generation_prompt='%s', start=%zu toks, end=%zu toks, forced=%zu toks\n", - budget, params.sampling.generation_prompt.c_str(), - params.sampling.reasoning_budget_start.size(), - params.sampling.reasoning_budget_end.size(), - params.sampling.reasoning_budget_forced.size()); - } - } - - { - params.sampling.logit_bias.clear(); - - const auto & logit_bias = data.find("logit_bias"); - if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_vocab_n_tokens(vocab); - for (const auto & el : *logit_bias) { - // TODO: we may want to throw errors here, in case "el" is incorrect - if (el.is_array() && el.size() == 2) { - float bias; - if (el[1].is_number()) { - bias = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - bias = -INFINITY; - } else { - continue; - } - - if (el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - params.sampling.logit_bias.push_back({tok, bias}); - } - } else if (el[0].is_string()) { - auto toks = common_tokenize(vocab, el[0].get(), false); - for (auto tok : toks) { - params.sampling.logit_bias.push_back({tok, bias}); - } - } - } - } - } else if (logit_bias != data.end() && logit_bias->is_object()) { - const int n_vocab = llama_vocab_n_tokens(vocab); - for (const auto & el : logit_bias->items()) { - float bias; - const auto & key = el.key(); - const auto & value = el.value(); - if (value.is_number()) { - bias = value.get(); - } else if (value.is_boolean() && !value.get()) { - bias = -INFINITY; - } else { - continue; - } - - char *end; - llama_token tok = strtol(key.c_str(), &end, 10); - if (*end == 0) { - if (tok >= 0 && tok < n_vocab) { - params.sampling.logit_bias.push_back({tok, bias}); - } - } else { - auto toks = common_tokenize(vocab, key, false); - for (auto tok : toks) { - params.sampling.logit_bias.push_back({tok, bias}); - } - } - } - } - - params.sampling.ignore_eos = json_value(data, "ignore_eos", params_base.sampling.ignore_eos); - if (params.sampling.ignore_eos) { - params.sampling.logit_bias.insert( - params.sampling.logit_bias.end(), - logit_bias_eog.begin(), logit_bias_eog.end()); - } - } - - { - params.antiprompt.clear(); - - const auto & stop = data.find("stop"); - if (stop != data.end() && stop->is_array()) { - for (const auto & word : *stop) { - if (!word.empty()) { - params.antiprompt.push_back(word); - } - } - } - // set reverse prompt from cli args if not set in the request - if (params.antiprompt.empty()) { - params.antiprompt = defaults.antiprompt; - } - } - - { - const auto samplers = data.find("samplers"); - if (samplers != data.end()) { - if (samplers->is_array()) { - params.sampling.samplers = common_sampler_types_from_names(*samplers); - } else if (samplers->is_string()){ - params.sampling.samplers = common_sampler_types_from_chars(samplers->get()); - } - } else { - params.sampling.samplers = defaults.sampling.samplers; - } - } - - if (params.n_cmpl > params_base.n_parallel) { - throw std::runtime_error("n_cmpl cannot be greater than the number of slots, please increase -np"); - } - return params; -} - -// // result_timings // @@ -979,10 +591,11 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp() { for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) { output.push_back(json { + {"id", "fc_" + tool_call.id}, {"type", "function_call"}, {"status", "completed"}, {"arguments", tool_call.arguments}, - {"call_id", "fc_" + tool_call.id}, + {"call_id", "call_" + tool_call.id}, {"name", tool_call.name}, }); } @@ -1078,10 +691,11 @@ json server_task_result_cmpl_final::to_json_oaicompat_resp_stream() { for (const common_chat_tool_call & tool_call : oaicompat_msg.tool_calls) { const json output_item = { + {"id", "fc_" + tool_call.id}, {"type", "function_call"}, {"status", "completed"}, {"arguments", tool_call.arguments}, - {"call_id", "fc_" + tool_call.id}, + {"call_id", "call_" + tool_call.id}, {"name", tool_call.name} }; server_sent_events.push_back(json { @@ -1665,8 +1279,9 @@ json server_task_result_cmpl_partial::to_json_oaicompat_resp() { {"data", json { {"type", "response.output_item.added"}, {"item", json { + {"id", "fc_" + diff.tool_call_delta.id}, {"arguments", ""}, - {"call_id", "fc_" + diff.tool_call_delta.id}, + {"call_id", "call_" + diff.tool_call_delta.id}, {"name", diff.tool_call_delta.name}, {"type", "function_call"}, {"status", "in_progress"}, diff --git a/tools/server/server-task.h b/tools/server/server-task.h index bdadcff76527..293bdf053abf 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -62,9 +62,6 @@ struct task_params { int32_t n_cache_reuse = 0; // min chunk size to attempt reusing from the cache via KV shifting (0 = disabled) - // number of prompt tokens before the latest user message - int32_t n_before_user = -1; - int64_t t_max_prompt_ms = -1; // TODO: implement int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit @@ -92,6 +89,9 @@ struct task_params { // per-request parameters for chat parsing common_chat_parser_params chat_parser_params; + // message spans for checkpointing + common_chat_msg_spans message_spans; + // Embeddings int32_t embd_normalize = 2; // (-1=none, 0=max absolute int16, 1=taxicab, 2=Euclidean/L2, >2=p-norm) @@ -210,13 +210,6 @@ struct server_task { } } - static task_params params_from_json_cmpl( - const llama_vocab * vocab, - const common_params & params_base, - const int n_ctx_slot, - const std::vector & logit_bias_eog, - const json & data); - // utility function static std::unordered_set get_list_id(const std::vector & tasks) { std::unordered_set ids(tasks.size()); @@ -312,6 +305,9 @@ struct server_task_result { } virtual json to_json() = 0; virtual ~server_task_result() = default; + virtual server_task_result * clone() const { + GGML_ABORT("not implemented for this task type"); + } }; // using shared_ptr for polymorphism of server_task_result @@ -649,3 +645,12 @@ struct server_prompt_cache { void update(); }; + +// used exclusively by router mode +struct server_task_result_router : server_task_result { + json data; + virtual json to_json() override { return data; } + virtual server_task_result * clone() const override { + return new server_task_result_router(*this); + } +}; diff --git a/tools/server/server-tools.cpp b/tools/server/server-tools.cpp index 97433fe4b500..790ed85a0641 100644 --- a/tools/server/server-tools.cpp +++ b/tools/server/server-tools.cpp @@ -11,6 +11,7 @@ #include #include #include +#include namespace fs = std::filesystem; @@ -568,9 +569,13 @@ struct server_tool_edit_file : server_tool { } int n = (int) lines.size(); if (e.line_start == -1) { - // -1 means end of file; line_end is ignored — normalize to point past last line - e.line_start = n + 1; - e.line_end = n + 1; + // -1 targets end of file -> valid for append only; line_end is ignored + if (e.mode != "append") { + return {{"error", "line_start -1 (end of file) is only valid for append mode"}}; + } + // append at end of file: insert position is the current line count + e.line_start = n; + e.line_end = n; } else { if (e.line_start < 1 || e.line_end < e.line_start) { return {{"error", string_format("invalid line range [%d, %d]", e.line_start, e.line_end)}}; @@ -611,8 +616,8 @@ struct server_tool_edit_file : server_tool { } else if (e.mode == "delete") { lines.erase(lines.begin() + idx_start, lines.begin() + idx_end + 1); } else { // append - // idx_end + 1 may equal lines.size() when line_start == -1 (end of file) - lines.insert(lines.begin() + idx_end + 1, new_lines.begin(), new_lines.end()); + // insert after idx_end; idx_end + 1 == lines.size() for end-of-file append + lines.insert(lines.begin() + (idx_end + 1), new_lines.begin(), new_lines.end()); } } diff --git a/tools/server/server.cpp b/tools/server/server.cpp index a6ea749d0c3f..1bbc99d89023 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -2,6 +2,7 @@ #include "server-http.h" #include "server-models.h" #include "server-cors-proxy.h" +#include "server-stream.h" #include "server-tools.h" #include "arg.h" @@ -82,6 +83,10 @@ int llama_server(int argc, char ** argv) { common_init(); + // start the stream session manager GC right after common init, before any HTTP route can + // touch it. lifecycle is symmetric, stop_gc() runs in clean_up() before backend free + g_stream_sessions.start_gc(); + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) { return 1; } @@ -89,30 +94,47 @@ int llama_server(int argc, char ** argv) { llama_backend_init(); llama_numa_init(params.numa); + common_models_handler models_handler; + try { + models_handler = common_models_handler_init(params, LLAMA_EXAMPLE_SERVER); + if (common_models_handler_is_preset_repo(models_handler)) { + // apply the preset and start the server in router mode + common_models_handler_apply(models_handler, params); + } + } catch (const std::exception & e) { + SRV_ERR("failed to fetch model metadata: %s\n", e.what()); + return 1; + } + // router server never loads a model and must not touch the GPU + const bool is_router_server = params.model.path.empty() + && params.model.hf_repo.empty(); + // skip device enumeration so the CUDA primary context stays uncreated - const bool is_router_server = params.model.path.empty(); common_params_print_info(params, !is_router_server); - // validate batch size for embeddings - // embeddings require all tokens to be processed in a single ubatch - // see https://github.com/ggml-org/llama.cpp/issues/12836 - if (params.embedding && params.n_batch > params.n_ubatch) { - SRV_WRN("embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", params.n_batch, params.n_ubatch); - SRV_WRN("setting n_batch = n_ubatch = %d to avoid assertion failure\n", params.n_ubatch); - params.n_batch = params.n_ubatch; - } + if (!is_router_server) { + // validate batch size for embeddings + // embeddings require all tokens to be processed in a single ubatch + // see https://github.com/ggml-org/llama.cpp/issues/12836 + if (params.embedding && params.n_batch > params.n_ubatch) { + SRV_WRN("embeddings enabled with n_batch (%d) > n_ubatch (%d)\n", params.n_batch, params.n_ubatch); + SRV_WRN("setting n_batch = n_ubatch = %d to avoid assertion failure\n", params.n_ubatch); + params.n_batch = params.n_ubatch; + } - if (params.n_parallel < 0) { - SRV_INF("%s", "n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n"); + if (params.n_parallel < 0) { + SRV_INF("%s", "n_parallel is set to auto, using n_parallel = 4 and kv_unified = true\n"); - params.n_parallel = 4; - params.kv_unified = true; + params.n_parallel = 4; + params.kv_unified = true; + } } // for consistency between server router mode and single-model mode, we set the same model name as alias - if (params.model_alias.empty() && !params.model.name.empty()) { - params.model_alias.insert(params.model.name); + auto model_name = params.model.get_name(); + if (params.model_alias.empty() && !model_name.empty()) { + params.model_alias.insert(model_name); } // struct that contains llama context and inference @@ -129,6 +151,7 @@ int llama_server(int argc, char ** argv) { // // register API routes + server_child child; // only used in non-router mode server_routes routes(params, ctx_server); server_tools tools; @@ -172,8 +195,11 @@ int llama_server(int argc, char ** argv) { routes.get_props = models_routes->get_router_props; routes.get_models = models_routes->get_router_models; + ctx_http.post("/models", ex_wrapper(models_routes->post_router_models)); ctx_http.post("/models/load", ex_wrapper(models_routes->post_router_models_load)); ctx_http.post("/models/unload", ex_wrapper(models_routes->post_router_models_unload)); + ctx_http.get ("/models/sse", ex_wrapper(models_routes->get_router_models_sse)); + ctx_http.del ("/models", ex_wrapper(models_routes->del_router_models)); } ctx_http.get ("/health", ex_wrapper(routes.get_health)); // public endpoint (no API key check) @@ -218,19 +244,58 @@ int llama_server(int argc, char ** argv) { ctx_http.get ("/slots", ex_wrapper(routes.get_slots)); ctx_http.post("/slots/:id_slot", ex_wrapper(routes.post_slots)); + // resumable streaming, the conversation_id is the session identity end to end. router and + // child wire different handlers under the same paths: a child binds the local g_stream_sessions + // backed factories, the router binds proxies that resolve the owning child through the + // conv_id -> model map + server_http_context::handler_t stream_get_h; + server_http_context::handler_t streams_lookup_h; + server_http_context::handler_t stream_delete_h; + if (is_router_server) { + stream_get_h = models_routes->router_stream_get; + streams_lookup_h = models_routes->router_streams_lookup; + stream_delete_h = models_routes->router_stream_delete; + } else { + stream_get_h = make_stream_get_handler(); + streams_lookup_h = make_streams_lookup_handler(); + stream_delete_h = make_stream_delete_handler(); + } + ctx_http.get ("/v1/stream/:conv_id", ex_wrapper(stream_get_h)); + // POST /v1/streams/lookup with body {"conversation_ids": [...]}. you can only ask for ids + // you already own (the WebUI passes the convs visible in its sidebar). the server never + // lists ids it has not been asked about, so a random caller cannot enumerate live sessions + ctx_http.post("/v1/streams/lookup", ex_wrapper(streams_lookup_h)); + ctx_http.del ("/v1/stream/:conv_id", ex_wrapper(stream_delete_h)); + // Google Cloud Platform (Vertex AI) compat ctx_http.register_gcp_compat(); + // return 403 for disabled features + server_http_context::handler_t res_403 = [](const server_http_req &) { + auto res = std::make_unique(); + res->status = 403; + res->data = safe_json_to_str({ + {"error", { + {"message", "this feature is disabled"}, + {"type", "feature_disabled"}, + }} + }); + return res; + }; + // CORS proxy (EXPERIMENTAL, only used by the Web UI for MCP) - // Supports both new ui_mcp_proxy and deprecated webui_mcp_proxy fields - if (params.ui_mcp_proxy || params.webui_mcp_proxy) { + if (params.ui_mcp_proxy) { SRV_WRN("%s", "-----------------\n"); SRV_WRN("%s", "CORS proxy is enabled, do not expose server to untrusted environments\n"); SRV_WRN("%s", "This feature is EXPERIMENTAL and may be removed or changed in future versions\n"); SRV_WRN("%s", "-----------------\n"); ctx_http.get ("/cors-proxy", ex_wrapper(proxy_handler_get)); ctx_http.post("/cors-proxy", ex_wrapper(proxy_handler_post)); + } else { + ctx_http.get ("/cors-proxy", ex_wrapper(res_403)); + ctx_http.post("/cors-proxy", ex_wrapper(res_403)); } + // EXPERIMENTAL built-in tools if (!params.server_tools.empty()) { try { @@ -245,6 +310,25 @@ int llama_server(int argc, char ** argv) { SRV_WRN("%s", "-----------------\n"); ctx_http.get ("/tools", ex_wrapper(tools.handle_get)); ctx_http.post("/tools", ex_wrapper(tools.handle_post)); + } else { + ctx_http.get ("/tools", ex_wrapper(res_403)); + ctx_http.post("/tools", ex_wrapper(res_403)); + } + + // + // Handle downloading model + // + + if (child.is_child() && child.get_mode() == SERVER_CHILD_MODE_DOWNLOAD) { + return child.run_download(params); + } else if (!is_router_server) { + // single-model mode (NOT spawned by router) + try { + common_models_handler_apply(models_handler, params); + } catch (const std::exception & e) { + SRV_ERR("failed to download model: %s\n", e.what()); + return 1; + } } // @@ -258,7 +342,10 @@ int llama_server(int argc, char ** argv) { clean_up = [&models_routes]() { SRV_INF("%s: cleaning up before exit...\n", __func__); + // stop the session GC first, it finalizes live sessions and wakes pending readers + g_stream_sessions.stop_gc(); if (models_routes.has_value()) { + models_routes->stopping.store(true); // maybe redundant, but just to be safe models_routes->models.unload_all(); } llama_backend_free(); @@ -272,6 +359,10 @@ int llama_server(int argc, char ** argv) { ctx_http.is_ready.store(true); shutdown_handler = [&](int) { + if (models_routes.has_value()) { + // important to disconnect any SSE clients + models_routes->stopping.store(true); + } ctx_http.stop(); }; @@ -279,6 +370,8 @@ int llama_server(int argc, char ** argv) { // setup clean up function, to be called before exit clean_up = [&ctx_http, &ctx_server]() { SRV_INF("%s: cleaning up before exit...\n", __func__); + // stop the session GC first, it finalizes live sessions and wakes pending readers + g_stream_sessions.stop_gc(); ctx_http.stop(); ctx_server.terminate(); llama_backend_free(); @@ -291,15 +384,16 @@ int llama_server(int argc, char ** argv) { return 1; } - // load the model - SRV_INF("%s", "loading model\n"); - - if (server_models::is_child_server()) { - ctx_server.on_sleeping_changed([&](bool sleeping) { - server_models::notify_router_sleeping_state(sleeping); + // setup communication child --> router if necessary + if (child.is_child()) { + ctx_server.set_state_callback([&](server_state state, json payload) { + child.notify_to_router(server_state_to_str(state), payload); }); } + // load the model + SRV_INF("%s", "loading model\n"); + if (!ctx_server.load_model(params)) { clean_up(); if (ctx_http.thread.joinable()) { @@ -339,6 +433,12 @@ int llama_server(int argc, char ** argv) { SRV_INF("router server is listening on %s\n", ctx_http.listening_address.c_str()); SRV_WRN("%s", "NOTE: router mode is experimental\n"); SRV_WRN("%s", " it is not recommended to use this mode in untrusted environments\n"); + + if (!params.models_preset_hf.empty()) { + SRV_WRN( "NOTE: using preset.ini from HF repo '%s'\n", params.models_preset_hf.c_str()); + SRV_WRN("%s", " please only use presets that you can trust! Unknown presets may be unsafe\n"); + } + if (ctx_http.thread.joinable()) { ctx_http.thread.join(); // keep the main thread alive } @@ -350,9 +450,9 @@ int llama_server(int argc, char ** argv) { // optionally, notify router server that this instance is ready std::thread monitor_thread; - if (server_models::is_child_server()) { - json model_info = routes.get_model_info(); - monitor_thread = server_models::setup_child_server(shutdown_handler, model_info); + if (child.is_child()) { + monitor_thread = child.setup(shutdown_handler); + child.notify_to_router(server_state_to_str(SERVER_STATE_READY), routes.get_model_info()); } // this call blocks the main thread until queue_tasks.terminate() is called diff --git a/tools/server/tests/unit/test_basic.py b/tools/server/tests/unit/test_basic.py index d1b89cf1a91c..285726abf406 100644 --- a/tools/server/tests/unit/test_basic.py +++ b/tools/server/tests/unit/test_basic.py @@ -79,9 +79,9 @@ def test_load_split_model(): assert match_regex("(little|girl)+", res.body["content"]) -def test_no_webui(): +def test_no_ui(): global server - # default: webui enabled + # default: UI enabled server.start() url = f"http://{server.server_host}:{server.server_port}" res = requests.get(url) @@ -89,8 +89,8 @@ def test_no_webui(): assert "" in res.text server.stop() - # with --no-webui - server.no_webui = True + # with --no-ui, the UI should be disabled + server.no_ui = True server.start() res = requests.get(url) assert res.status_code == 404 diff --git a/tools/server/tests/unit/test_chat_completion.py b/tools/server/tests/unit/test_chat_completion.py index fe55dc5ab179..0258b539ed87 100644 --- a/tools/server/tests/unit/test_chat_completion.py +++ b/tools/server/tests/unit/test_chat_completion.py @@ -307,6 +307,20 @@ def test_completion_with_grammar(jinja: bool, grammar: str, n_predicted: int, re assert match_regex(re_content, choice["message"]["content"]), choice["message"]["content"] +def test_completion_with_invalid_grammar(): + global server + server.start() + res = server.make_request("POST", "/chat/completions", data={ + "max_tokens": 8, + "messages": [ + {"role": "user", "content": "Does not matter what I say, does it?"}, + ], + "grammar": "root ::= this is (not valid GBNF", + }) + assert res.status_code == 400, res.body + assert "error" in res.body + + @pytest.mark.parametrize("messages", [ None, "string", @@ -589,3 +603,23 @@ def test_chat_completions_token_count(): }) assert res.status_code == 200 assert res.body["input_tokens"] > 5 + + +def test_verbose_debug(): + global server + server.start() + for verbose in [True, False]: + res = server.make_request("POST", "/chat/completions", data={ + "max_tokens": 2, + "messages": [ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + "verbose": verbose, + }) + assert res.status_code == 200 + if verbose: + assert "__verbose" in res.body + assert "Book" in res.body["__verbose"]["prompt"] + else: + assert "__verbose" not in res.body diff --git a/tools/server/tests/unit/test_proxy.py b/tools/server/tests/unit/test_proxy.py index b7c3326187b5..0fed536e59af 100644 --- a/tools/server/tests/unit/test_proxy.py +++ b/tools/server/tests/unit/test_proxy.py @@ -12,16 +12,16 @@ def create_server(): def test_mcp_no_proxy(): global server - server.webui_mcp_proxy = False + server.ui_mcp_proxy = False server.start() res = server.make_request("GET", "/cors-proxy") - assert res.status_code == 404 + assert res.status_code == 403 def test_mcp_proxy(): global server - server.webui_mcp_proxy = True + server.ui_mcp_proxy = True server.start() url = f"http://{server.server_host}:{server.server_port}/cors-proxy?url=http://example.com" @@ -32,7 +32,7 @@ def test_mcp_proxy(): def test_mcp_proxy_custom_port(): global server - server.webui_mcp_proxy = True + server.ui_mcp_proxy = True server.start() # try getting the server's models API via the proxy diff --git a/tools/server/tests/unit/test_router.py b/tools/server/tests/unit/test_router.py index c93b92b0b2ee..94165e520e6f 100644 --- a/tools/server/tests/unit/test_router.py +++ b/tools/server/tests/unit/test_router.py @@ -1,3 +1,4 @@ +import threading import pytest from utils import * @@ -253,3 +254,138 @@ def test_router_reload_models(): assert "model-reload-c" in ids, "newly added model should appear" finally: os.remove(preset_path) + + +def test_router_remote_preset(): + global server + server.model_hf_repo = "ggml-org/test-preset-ci" + server.model_hf_file = None + server.offline = False + server.start() + + # Should see preset models in GET /models + res = server.make_request("GET", "/models") + assert res.status_code == 200 + ids = {item["id"] for item in res.body.get("data", [])} + assert "tinygemma3-preset" in ids + assert "stories260K-test" in ids + + # Should be able to load a preset model + model_id = "tinygemma3-preset" + _load_model_and_wait(model_id) + + +MODEL_DOWNLOAD_ID = "ggml-org/test-model-router-download:F16" +MODEL_DOWNLOAD_TIMEOUT = 30 + + +def _listen_sse( + server: ServerProcess, collected: list, stop: threading.Event, ready: threading.Event | None = None +): + """Collect /models/sse events into `collected` until `stop` is set. + + When `ready` is provided, it is set once the streaming response is open, + i.e. the server has accepted the connection and registered us as a + subscriber. Callers that trigger one-shot events (e.g. download_finished) + must wait on `ready` before acting, otherwise the event can be broadcast + before this client is subscribed and be lost. + """ + url = f"http://{server.server_host}:{server.server_port}/models/sse" + try: + with requests.get(url, stream=True, timeout=MODEL_DOWNLOAD_TIMEOUT) as resp: + if ready is not None: + ready.set() + for line_bytes in resp.iter_lines(): + if stop.is_set(): + break + line = line_bytes.decode("utf-8") + if line.startswith("data: "): + collected.append(json.loads(line[6:])) + except Exception: + pass + + +def _wait_for_sse_event(collected: list, event_type: str, model: str, timeout: int) -> bool: + deadline = time.time() + timeout + while time.time() < deadline: + if any(e.get("event") == event_type and e.get("model") == model for e in collected): + return True + time.sleep(0.5) + return False + + +def test_router_download_model(): + """Case 1: download a model, verify SSE events and GET /models.""" + global server + server.start() + + # Ensure the model is not present before we start + server.make_request("DELETE", f"/models?model={MODEL_DOWNLOAD_ID}") + + sse_events: list = [] + stop = threading.Event() + sse_ready = threading.Event() + sse_thread = threading.Thread( + target=_listen_sse, args=(server, sse_events, stop, sse_ready), daemon=True + ) + sse_thread.start() + + # wait for the SSE client to be subscribed before triggering the download, + # otherwise the one-shot download_finished event can be broadcast before + # this client is registered and be lost + assert sse_ready.wait(10), "SSE client failed to connect" + + # Trigger the download + res = server.make_request("POST", "/models", data={"model": MODEL_DOWNLOAD_ID}) + assert res.status_code == 200 + assert res.body.get("success") is True + + # Wait for download_finished SSE event + finished = _wait_for_sse_event( + sse_events, "download_finished", MODEL_DOWNLOAD_ID, MODEL_DOWNLOAD_TIMEOUT + ) + stop.set() + + assert finished, "Never received download_finished SSE event" + assert any( + e.get("event") == "download_progress" and e.get("model") == MODEL_DOWNLOAD_ID + for e in sse_events + ), "No download_progress events received" + + # Model should now appear in GET /models + ids = _get_model_ids(is_reload=False) + assert MODEL_DOWNLOAD_ID in ids, f"{MODEL_DOWNLOAD_ID} not found in /models after download" + + +def test_router_delete_model(): + """Case 2: delete the downloaded model, verify it disappears from GET /models.""" + global server + server.start() + + # Ensure the model exists (download it if needed) + if MODEL_DOWNLOAD_ID not in _get_model_ids(is_reload=False): + sse_events: list = [] + stop = threading.Event() + sse_ready = threading.Event() + threading.Thread( + target=_listen_sse, args=(server, sse_events, stop, sse_ready), daemon=True + ).start() + # subscribe before triggering the download so the one-shot + # download_finished event is not lost (see test_router_download_model) + assert sse_ready.wait(10), "SSE client failed to connect" + res = server.make_request("POST", "/models", data={"model": MODEL_DOWNLOAD_ID}) + assert res.status_code == 200 + finished = _wait_for_sse_event( + sse_events, "download_finished", MODEL_DOWNLOAD_ID, MODEL_DOWNLOAD_TIMEOUT + ) + stop.set() + assert finished, "Model did not finish downloading before delete test" + + # Delete the model + del_res = server.make_request("DELETE", f"/models?model={MODEL_DOWNLOAD_ID}") + assert del_res.status_code == 200 + assert del_res.body.get("success") is True + + # Model should no longer appear in GET /models + ids = _get_model_ids(is_reload=False) + assert MODEL_DOWNLOAD_ID not in ids, f"{MODEL_DOWNLOAD_ID} still present after deletion" diff --git a/tools/server/tests/unit/test_security.py b/tools/server/tests/unit/test_security.py index bb22095f125c..a0c3e214ae3e 100644 --- a/tools/server/tests/unit/test_security.py +++ b/tools/server/tests/unit/test_security.py @@ -1,6 +1,8 @@ import pytest from openai import OpenAI from utils import * +import threading +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer server = ServerPreset.tinyllama2() @@ -26,7 +28,7 @@ def test_access_static_assets_without_api_key(): """Static web UI assets should not require API key authentication (issue #21229)""" global server server.start() - for path in ["/", "/bundle.js", "/bundle.css"]: + for path in ["/", "/sw.js", "/manifest.webmanifest", "/_app/version.json"]: res = server.make_request("GET", path) assert res.status_code == 200, f"Expected 200 for {path}, got {res.status_code}" @@ -105,6 +107,49 @@ def test_cors_options(origin: str, cors_header: str, cors_header_value: str): assert res.headers[cors_header] == cors_header_value +def test_cors_proxy_only_forwards_explicit_proxy_headers(): + class CaptureHeadersHandler(BaseHTTPRequestHandler): + def do_GET(self): + self.server.captured_headers = dict(self.headers) + self.send_response(200) + self.end_headers() + self.wfile.write(b"ok") + + def log_message(self, format, *args): + pass + + target = ThreadingHTTPServer(("127.0.0.1", 0), CaptureHeadersHandler) + target.captured_headers = {} + target_thread = threading.Thread(target=target.serve_forever, daemon=True) + target_thread.start() + + try: + server = ServerPreset.tinyllama2() + server.api_key = TEST_API_KEY + server.ui_mcp_proxy = True + server.start() + + res = server.make_request("GET", f"/cors-proxy?url=http://127.0.0.1:{target.server_port}/capture", headers={ + "Authorization": f"Bearer {TEST_API_KEY}", + "Proxy-Authorization": "Basic secret", + "X-Api-Key": TEST_API_KEY, + "Cookie": "session=secret", + "x-llama-server-proxy-header-accept": "application/json", + "x-llama-server-proxy-header-authorization": "Bearer explicit", + }) + + assert res.status_code == 200 + captured = {key.lower(): value for key, value in target.captured_headers.items()} + assert captured["accept"] == "application/json" + assert captured["authorization"] == "Bearer explicit" + assert "proxy-authorization" not in captured + assert "x-api-key" not in captured + assert "cookie" not in captured + finally: + target.shutdown() + target.server_close() + + @pytest.mark.parametrize( "media_path, image_url, success", [ diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index c5dba1c139fc..63a959449eb6 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -94,7 +94,7 @@ class ServerProcess: enable_ctx_shift: int | None = False spec_draft_n_min: int | None = None spec_draft_n_max: int | None = None - no_webui: bool | None = None + no_ui: bool | None = None jinja: bool | None = None reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None reasoning: Literal['on', 'off', 'auto'] | None = None @@ -107,7 +107,7 @@ class ServerProcess: cache_ram: int | None = None no_cache_idle_slots: bool = False log_path: str | None = None - webui_mcp_proxy: bool = False + ui_mcp_proxy: bool = False backend_sampling: bool = False gcp_compat: bool = False @@ -225,8 +225,8 @@ def start(self, timeout_seconds: int = DEFAULT_HTTP_TIMEOUT) -> None: server_args.extend(["--spec-draft-n-max", self.spec_draft_n_max]) if self.spec_draft_n_min: server_args.extend(["--spec-draft-n-min", self.spec_draft_n_min]) - if self.no_webui: - server_args.append("--no-webui") + if self.no_ui: + server_args.append("--no-ui") if self.no_models_autoload: server_args.append("--no-models-autoload") if self.jinja: @@ -251,8 +251,8 @@ def start(self, timeout_seconds: int = DEFAULT_HTTP_TIMEOUT) -> None: server_args.extend(["--cache-ram", self.cache_ram]) if self.no_cache_idle_slots: server_args.append("--no-cache-idle-slots") - if self.webui_mcp_proxy: - server_args.append("--webui-mcp-proxy") + if self.ui_mcp_proxy: + server_args.append("--ui-mcp-proxy") if self.backend_sampling: server_args.append("--backend_sampling") if self.gcp_compat: @@ -340,6 +340,9 @@ def make_request( elif method == "POST": response = requests.post(url, headers=headers, json=data, timeout=timeout) parse_body = True + elif method == "DELETE": + response = requests.delete(url, headers=headers, timeout=timeout) + parse_body = True elif method == "OPTIONS": response = requests.options(url, headers=headers, timeout=timeout) else: diff --git a/tools/ui/.gitignore b/tools/ui/.gitignore index 22ed6125f4ca..0bb8c9b3c218 100644 --- a/tools/ui/.gitignore +++ b/tools/ui/.gitignore @@ -8,6 +8,8 @@ node_modules .wrangler /.svelte-kit /build +dev-dist +dist # OS .DS_Store @@ -23,6 +25,14 @@ Thumbs.db vite.config.js.timestamp-* vite.config.ts.timestamp-* +# PWA Artifacts +apple-splash-*.png +apple-touch-icon-*.png +maskable-icon-*.png +pwa-*.png +static/favicon* + +# Storybook *storybook.log storybook-static *.code-workspace diff --git a/tools/ui/CMakeLists.txt b/tools/ui/CMakeLists.txt index 60d9020da389..74bca417e372 100644 --- a/tools/ui/CMakeLists.txt +++ b/tools/ui/CMakeLists.txt @@ -1,6 +1,7 @@ set(TARGET llama-ui) -set(LLAMA_UI_HF_BUCKET "llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt UI assets") +set(LLAMA_UI_HF_BUCKET "ggml-org/llama-ui" CACHE STRING "Hugging Face bucket name for prebuilt UI assets") +set(LLAMA_UI_GZIP ON CACHE BOOL "Apply gzip compress to assets to save bandwidth") # Backward compat: forward old var to new one if(DEFINED LLAMA_BUILD_WEBUI) @@ -77,11 +78,13 @@ add_custom_target(llama-ui-assets ALL "-DUI_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}" "-DUI_BINARY_DIR=${CMAKE_CURRENT_BINARY_DIR}" "-DLLAMA_SOURCE_DIR=${PROJECT_SOURCE_DIR}" + "-DLLAMA_BUILD_NUMBER=${LLAMA_BUILD_NUMBER}" "-DHF_BUCKET=${LLAMA_UI_HF_BUCKET}" "-DHF_VERSION=${HF_UI_VERSION}" "-DHF_ENABLED=${LLAMA_USE_PREBUILT_UI}" "-DBUILD_UI=${LLAMA_BUILD_UI}" "-DLLAMA_UI_EMBED=${LLAMA_UI_EMBED_EXE}" + "-DLLAMA_UI_GZIP=${LLAMA_UI_GZIP}" -P "${PROJECT_SOURCE_DIR}/scripts/ui-assets.cmake" COMMENT "Provisioning UI assets" VERBATIM diff --git a/tools/ui/embed.cpp b/tools/ui/embed.cpp index f603b8569c87..cdbb64232367 100644 --- a/tools/ui/embed.cpp +++ b/tools/ui/embed.cpp @@ -1,16 +1,44 @@ // llama-ui-embed: generate ui.cpp / ui.h that embed UI assets as C arrays. // // Usage: -// llama-ui-embed [ ]... +// llama-ui-embed [] +// +// Recursively embeds every regular file under . +// Asset names are relative paths from (e.g. "_app/immutable/bundle.HASH.js"). +// Without , emits an empty asset table. +#include #include +#include #include #include + +#include +#include #include +#include #include #include -#include -#include + + +static const char * mime_from_ext(const std::string & name) { + auto ext = name.rfind('.'); + if (ext == std::string::npos) return "application/octet-stream"; + std::string e = name.substr(ext + 1); + if (e == "html") return "text/html; charset=utf-8"; + if (e == "css") return "text/css"; + if (e == "js") return "application/javascript"; + if (e == "json") return "application/json"; + if (e == "webmanifest") return "application/manifest+json"; + if (e == "svg") return "image/svg+xml"; + if (e == "png") return "image/png"; + if (e == "jpg" || + e == "jpeg") return "image/jpeg"; + if (e == "ico") return "image/x-icon"; + if (e == "woff") return "font/woff"; + if (e == "woff2") return "font/woff2"; + return "application/octet-stream"; +} // Computes FNV-1a hash of the data static uint64_t fnv_hash(const uint8_t * data, size_t len) { @@ -24,10 +52,10 @@ static uint64_t fnv_hash(const uint8_t * data, size_t len) { return hash; } -static bool read_file(const std::string & path, std::vector & out) { +static bool read_file(const std::filesystem::path & path, std::vector & out) { std::ifstream f(path, std::ios::binary | std::ios::ate); if (!f) { - fprintf(stderr, "embed: cannot open %s\n", path.c_str()); + fprintf(stderr, "embed: cannot open %s\n", path.string().c_str()); return false; } const auto sz = f.tellg(); @@ -77,7 +105,24 @@ static bool write_if_different(const std::string & path, const std::string & con if (!content.empty()) { out.write(content.data(), static_cast(content.size())); } - return out.good(); + bool ok = out.good(); + if (ok) { + printf("embed: write output file %s\n", path.c_str()); + } + return ok; +} + +static std::string path_basename(const std::string & name) { + const size_t p = name.rfind('/'); + return p == std::string::npos ? name : name.substr(p + 1); +} +static bool str_starts_with(const std::string & s, const char * prefix) { + const size_t n = strlen(prefix); + return s.size() >= n && s.compare(0, n, prefix) == 0; +} +static bool str_ends_with(const std::string & s, const char * suffix) { + const size_t n = strlen(suffix); + return s.size() >= n && s.compare(s.size() - n, n, suffix) == 0; } static std::string fmt(const char * pattern, ...) { @@ -89,72 +134,171 @@ static std::string fmt(const char * pattern, ...) { return (n > 0) ? std::string(tmp, static_cast(n)) : std::string(); } +struct asset_entry { + std::string name; + std::filesystem::path path; +}; + int main(int argc, char ** argv) { - if (argc < 3 || ((argc - 3) % 2) != 0) { - fprintf(stderr, "usage: %s [ ]...\n", argv[0]); + if (argc < 3 || argc > 4) { + fprintf(stderr, "usage: %s []\n", argv[0]); return 1; } - const std::string out_cpp = argv[1]; - const std::string out_h = argv[2]; - const int n_assets = (argc - 3) / 2; + const std::string out_cpp = argv[1]; + const std::string out_h = argv[2]; + const std::string asset_dir = (argc >= 4) ? argv[3] : std::string(); + + const bool use_gzip = !asset_dir.empty() && std::filesystem::exists(asset_dir + "/_gzip"); + const std::string in_dir = use_gzip ? (asset_dir + "/_gzip") : asset_dir; + + std::vector assets; + if (!in_dir.empty()) { + const std::filesystem::path dir = in_dir; + + std::error_code ec; + std::filesystem::recursive_directory_iterator it(dir, ec); + if (ec) { + fprintf(stderr, "embed: cannot iterate %s: %s\n", argv[3], ec.message().c_str()); + return 1; + } + for (const auto & entry : it) { + if (!entry.is_regular_file()) { + continue; + } + // name is the relative path from dir, with forward slashes + const std::string name = entry.path().lexically_relative(dir).generic_string(); + assets.push_back({ name, entry.path() }); + } + + // directory iteration order is unspecified; sort for reproducible output + std::sort(assets.begin(), assets.end(), + [](const asset_entry & a, const asset_entry & b) { return a.name < b.name; }); + } + + const int n_assets = static_cast(assets.size()); + + if (n_assets > 0) { + using match_fn = std::function; + auto exact = [](const char * name) -> match_fn { + return [name](const std::string & base) { return base == name; }; + }; + + struct required_check { const char * label; match_fn match; bool found; }; + required_check checks[] = { + { "index.html", exact("index.html"), false }, + { "loading.html", exact("loading.html"), false }, + { "manifest.webmanifest", exact("manifest.webmanifest"), false }, + { "sw.js", exact("sw.js"), false }, + { "build.json", exact("build.json"), false }, + { "version.json", exact("version.json"), false }, + { "bundle[hash].js", [](const std::string & b) { + return str_starts_with(b, "bundle") && str_ends_with(b, ".js"); + }, false }, + { "bundle[hash].css", [](const std::string & b) { + return str_starts_with(b, "bundle") && str_ends_with(b, ".css"); + }, false }, + { "workbox[hash].js", [](const std::string & b) { + return str_starts_with(b, "workbox") && str_ends_with(b, ".js"); + }, false }, + }; + + for (const auto & a : assets) { + const std::string base = path_basename(a.name); + for (auto & c : checks) { + if (!c.found) { c.found = c.match(base); } + } + } + + std::vector missing; + for (const auto & c : checks) { + if (!c.found) { missing.push_back(c.label); } + } + if (!missing.empty()) { + fprintf(stderr, "\ncurrent asset files:\n"); + for (const auto & a : assets) { + fprintf(stderr, " %s\n", a.name.c_str()); + } + fprintf(stderr, "missing required asset(s):\n"); + for (const char * m : missing) { + fprintf(stderr, " %s\n", m); + } + fprintf(stderr, "hint: try cleaning your build directory: %s\n", in_dir.c_str()); + return 1; + } + } std::string h; - h += "#pragma once\n\n#include \n\n"; + h += "#pragma once\n\n#include \n#include \n\n"; if (n_assets > 0) { h += "#define LLAMA_UI_HAS_ASSETS 1\n\n"; } h += "struct llama_ui_asset {\n" - " const char * name;\n" + " std::string name;\n" " const unsigned char * data;\n" - " size_t size;\n" - " const char * etag;\n" + " std::size_t size;\n" + " std::string etag;\n" + " std::string type;\n" "};\n\n" - "const llama_ui_asset * llama_ui_find_asset(const char * name);\n"; + "const llama_ui_asset * llama_ui_find_asset(const std::string & name);\n" + "bool llama_ui_use_gzip();\n"; + h += fmt("const std::array & llama_ui_get_assets();\n", n_assets); std::string cpp; - cpp += "#include \"ui.h\"\n\n#include \n\n"; + cpp += "#include \"ui.h\"\n\n"; if (n_assets > 0) { for (int i = 0; i < n_assets; i++) { - const char * path = argv[3 + i * 2 + 1]; std::vector bytes; - if (!read_file(path, bytes)) { + if (!read_file(assets[i].path, bytes)) { + return 1; + } + if (bytes.empty()) { + fprintf(stderr, "embed: empty file: %s\n", assets[i].path.generic_string().c_str()); return 1; } cpp += fmt("static const unsigned char asset_%d_data[] = {", i); append_bytes_hex(cpp, bytes); const auto hash = fnv_hash(bytes.data(), bytes.size()); - cpp += fmt("};\nstatic const size_t asset_%d_size = %zu;\n", + cpp += fmt("};\nstatic const std::size_t asset_%d_size = %zu;\n", i, bytes.size()); - cpp += fmt("static const char asset_%d_etag[] = \"\\\"0x%016" PRIx64 "\\\"\";\n\n", + cpp += fmt("static const char asset_%d_etag[] = \"\\\"0x%016" PRIx64 "\\\"\";\n\n", i, hash); } - cpp += "static const llama_ui_asset g_assets[] = {\n"; + cpp += fmt("static const std::array g_assets = {{\n", n_assets); for (int i = 0; i < n_assets; i++) { - cpp += fmt(" { \"%s\", asset_%d_data, asset_%d_size, asset_%d_etag },\n", - argv[3 + i * 2], i, i, i); + const std::string & name = assets[i].name; + cpp += fmt(" { \"%s\", asset_%d_data, asset_%d_size, asset_%d_etag, \"%s\" },\n", + name.c_str(), i, i, i, mime_from_ext(name)); } - cpp += "};\n\n"; + cpp += "}};\n\n"; cpp += - "const llama_ui_asset * llama_ui_find_asset(const char * name) {\n" + "const llama_ui_asset * llama_ui_find_asset(const std::string & name) {\n" " for (const auto & a : g_assets) {\n" - " if (strcmp(a.name, name) == 0) {\n" + " if (a.name == name) {\n" " return &a;\n" " }\n" " }\n" " return nullptr;\n" "}\n"; + cpp += fmt("const std::array & llama_ui_get_assets() {\n", n_assets); + cpp += " return g_assets;\n" + "}\n"; } else { cpp += - "const llama_ui_asset * llama_ui_find_asset(const char *) {\n" + "const llama_ui_asset * llama_ui_find_asset(const std::string &) {\n" " return nullptr;\n" + "}\n" + "const std::array & llama_ui_get_assets() {\n" + " static const std::array empty{};\n" + " return empty;\n" "}\n"; } + cpp += fmt("bool llama_ui_use_gzip() { return %s; }\n", use_gzip ? "true" : "false"); bool ok = true; ok = write_if_different(out_h, h) && ok; diff --git a/tools/ui/package-lock.json b/tools/ui/package-lock.json index ffd4f6ca029a..9dce3a0c9d4b 100644 --- a/tools/ui/package-lock.json +++ b/tools/ui/package-lock.json @@ -26,29 +26,33 @@ "@tailwindcss/forms": "0.5.10", "@tailwindcss/typography": "0.5.16", "@tailwindcss/vite": "4.1.11", - "@types/node": "^24", + "@types/node": "24.13.0", + "@vite-pwa/assets-generator": "1.0.2", + "@vite-pwa/sveltekit": "1.1.0", "@vitest/browser": "4.1.8", "@vitest/browser-playwright": "4.1.8", "@vitest/coverage-v8": "4.1.8", "bits-ui": "2.18.1", "clsx": "2.1.1", - "dexie": "4.0.11", - "eslint": "9.39.2", + "dexie": "4.4.3", + "dompurify": "3.4.11", + "eslint": "9.39.4", "eslint-config-prettier": "10.1.8", - "eslint-plugin-storybook": "10.2.4", - "eslint-plugin-svelte": "3.15.0", - "globals": "16.3.0", + "eslint-plugin-storybook": "10.4.2", + "eslint-plugin-svelte": "3.19.0", + "fflate": "0.8.3", + "globals": "16.5.0", "highlight.js": "11.11.1", "http-server": "14.1.1", "mdast": "3.0.0", - "mdsvex": "0.12.6", + "mdsvex": "0.12.7", "mermaid": "11.15.0", "mode-watcher": "1.1.0", "pdfjs-dist": "5.4.54", "playwright": "1.56.1", - "prettier": "3.6.2", - "prettier-plugin-svelte": "3.4.0", - "prettier-plugin-tailwindcss": "0.6.14", + "prettier": "3.8.3", + "prettier-plugin-svelte": "4.1.0", + "prettier-plugin-tailwindcss": "0.8.0", "rehype-highlight": "7.0.2", "rehype-katex": "7.0.1", "rehype-stringify": "10.0.1", @@ -58,31 +62,31 @@ "remark-html": "16.0.1", "remark-math": "6.0.0", "remark-rehype": "11.1.2", - "sass": "1.93.3", - "storybook": "10.3.3", - "svelte": "5.55.7", - "svelte-check": "4.3.0", - "svelte-sonner": "1.0.5", - "tailwind-merge": "3.3.1", + "sass": "1.100.0", + "storybook": "10.4.2", + "svelte": "5.56.1", + "svelte-check": "4.6.0", + "svelte-sonner": "1.1.1", + "tailwind-merge": "3.6.0", "tailwind-variants": "3.2.2", - "tailwindcss": "4.1.11", - "tw-animate-css": "1.3.5", - "typescript": "5.8.3", - "typescript-eslint": "8.56.0", + "tailwindcss": "4.3.0", + "tw-animate-css": "1.4.0", + "typescript": "5.9.3", + "typescript-eslint": "8.60.1", "unified": "11.0.5", - "unist-util-visit": "5.0.0", + "unist-util-visit": "5.1.0", "uuid": "13.0.2", - "vite": "7.3.2", + "vite": "7.3.5", "vite-plugin-devtools-json": "0.2.1", "vitest": "4.1.8", "vitest-browser-svelte": "2.1.1", - "zod": "4.2.1" + "workbox-window": "7.4.1" } }, "node_modules/@adobe/css-tools": { - "version": "4.4.4", - "resolved": "https://registry.npmjs.org/@adobe/css-tools/-/css-tools-4.4.4.tgz", - "integrity": "sha512-Elp+iwUx5rN5+Y8xLt5/GRoG20WGoDCQ/1Fb+1LiGtvwbDavuSk0jhD/eZdckHAuzcDzccnkv+rEjyWfRx18gg==", + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/@adobe/css-tools/-/css-tools-4.5.0.tgz", + "integrity": "sha512-6OzddxPio9UiWTCemp4N8cYLV2ZN1ncRnV1cVGtve7dhPOtRkleRyx32GQCYSwDYgaHU3USMm84tNsvKzRCa1Q==", "dev": true, "license": "MIT" }, @@ -114,13 +118,29 @@ "url": "https://github.com/sponsors/antfu" } }, + "node_modules/@apideck/better-ajv-errors": { + "version": "0.3.7", + "resolved": "https://registry.npmjs.org/@apideck/better-ajv-errors/-/better-ajv-errors-0.3.7.tgz", + "integrity": "sha512-TajUJwGWbDwkCx/CZi7tRE8PVB7simCvKJfHUsSdvps+aTM/PDPP4gkLmKnc+x3CE//y9i/nj74GqdL/hwk7Iw==", + "dev": true, + "license": "MIT", + "dependencies": { + "jsonpointer": "^5.0.1", + "leven": "^3.1.0" + }, + "engines": { + "node": ">=10" + }, + "peerDependencies": { + "ajv": ">=8" + } + }, "node_modules/@babel/code-frame": { "version": "7.29.7", "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.29.7.tgz", "integrity": "sha512-Aup7aUOfpbAUg2ROOJN6Iw5f9DMBlzu0mIkm/malLQFN/YQgO48wCj0Kxa3sEHJvPVFg7siR+qRInwXd2qhQKw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@babel/helper-validator-identifier": "^7.29.7", "js-tokens": "^4.0.0", @@ -130,1729 +150,1792 @@ "node": ">=6.9.0" } }, - "node_modules/@babel/code-frame/node_modules/js-tokens": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", - "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", - "dev": true, - "license": "MIT", - "peer": true - }, - "node_modules/@babel/helper-string-parser": { + "node_modules/@babel/compat-data": { "version": "7.29.7", - "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.29.7.tgz", - "integrity": "sha512-Pb5ijPrZ89GDH8223L4UP8i6QApWxs04RbPQJTeWDV0/keR2E36MeKnyr6LYmUUvqRRI+Iv87SuF1W6ErINzYw==", + "resolved": "https://registry.npmjs.org/@babel/compat-data/-/compat-data-7.29.7.tgz", + "integrity": "sha512-locTkQyKvwIEgBzVrn8693ebc97F2U8ZHjbXwDXJ5Fn2TCpNwTlKcaKLkdHop5c/icOFE7qt7Q9JC5hnKNa6Gg==", "dev": true, "license": "MIT", "engines": { "node": ">=6.9.0" } }, - "node_modules/@babel/helper-validator-identifier": { + "node_modules/@babel/core": { "version": "7.29.7", - "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.29.7.tgz", - "integrity": "sha512-qehxGkRj55h/ff8EMaJ+cYhyaKlHIxqYDn682wQD7RNp9UujOQsHog2uS0r2vzr4pW+sXf90NeeayjcNaX3fFg==", + "resolved": "https://registry.npmjs.org/@babel/core/-/core-7.29.7.tgz", + "integrity": "sha512-RgHBCvtjbOK2gXSNBNIkNoEc9qoVEtau3hj8gEqKQuL3HZAibKarWFEI3Lfm6EYKkLalOh8eSrj9b+ch9H/VBA==", "dev": true, "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.29.7", + "@babel/generator": "^7.29.7", + "@babel/helper-compilation-targets": "^7.29.7", + "@babel/helper-module-transforms": "^7.29.7", + "@babel/helpers": "^7.29.7", + "@babel/parser": "^7.29.7", + "@babel/template": "^7.29.7", + "@babel/traverse": "^7.29.7", + "@babel/types": "^7.29.7", + "@jridgewell/remapping": "^2.3.5", + "convert-source-map": "^2.0.0", + "debug": "^4.1.0", + "gensync": "^1.0.0-beta.2", + "json5": "^2.2.3", + "semver": "^6.3.1" + }, "engines": { "node": ">=6.9.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/babel" } }, - "node_modules/@babel/parser": { + "node_modules/@babel/core/node_modules/semver": { + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", + "dev": true, + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + } + }, + "node_modules/@babel/generator": { "version": "7.29.7", - "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.29.7.tgz", - "integrity": "sha512-hnORnjP/1P/zFEndoeX+n+t1RwWRJiJpM/jO7FW32Kn9r5+sJB2JWOdYo4L6k78j15eCwY3Gm/7364B1EMwtNg==", + "resolved": "https://registry.npmjs.org/@babel/generator/-/generator-7.29.7.tgz", + "integrity": "sha512-DkXD5OJQaAQIdZ1bt3UZdEnHAn9Imd3IVBdX03UFe+ony9Ojw5pzr9YVKGDY1jt+Gcn/FnGkNf8r+Vj5NOJWtQ==", "dev": true, "license": "MIT", "dependencies": { - "@babel/types": "^7.29.7" - }, - "bin": { - "parser": "bin/babel-parser.js" + "@babel/parser": "^7.29.7", + "@babel/types": "^7.29.7", + "@jridgewell/gen-mapping": "^0.3.12", + "@jridgewell/trace-mapping": "^0.3.28", + "jsesc": "^3.0.2" }, "engines": { - "node": ">=6.0.0" + "node": ">=6.9.0" } }, - "node_modules/@babel/runtime": { + "node_modules/@babel/helper-annotate-as-pure": { "version": "7.29.7", - "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.29.7.tgz", - "integrity": "sha512-Nq8OhGWiZIZGV6hLHoyAKLLcJihP/xFeBMGJoUrxTX2psI8dCifzLhZISFb+VWS3wFMRDmCGw5R+dOySCqPLhw==", + "resolved": "https://registry.npmjs.org/@babel/helper-annotate-as-pure/-/helper-annotate-as-pure-7.29.7.tgz", + "integrity": "sha512-OoK6239jHPuSQOoS0kfTVKn0b/rVTk0seKq4Gd2UMLtmOVLjDC0ki3e+c90Trqv2gMfvJFqkiljrr568+qddiw==", "dev": true, "license": "MIT", - "peer": true, + "dependencies": { + "@babel/types": "^7.29.7" + }, "engines": { "node": ">=6.9.0" } }, - "node_modules/@babel/types": { + "node_modules/@babel/helper-compilation-targets": { "version": "7.29.7", - "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.29.7.tgz", - "integrity": "sha512-4zBIxpPzowiZpusoFkyGVwakdRJUyuH5PxQ/PrqghfdFWWasvnCdPfQXHrenDai+gyLARulZjZowCOj6fjT4pA==", + "resolved": "https://registry.npmjs.org/@babel/helper-compilation-targets/-/helper-compilation-targets-7.29.7.tgz", + "integrity": "sha512-wem6WaBj4NaVYVdNhLPPVacES6ZJ+KBBfSkTMD3YZxbP3rm3Di85tJU5ljaUNhaOynt+Aj0xruhYuzQBt8n71g==", "dev": true, "license": "MIT", "dependencies": { - "@babel/helper-string-parser": "^7.29.7", - "@babel/helper-validator-identifier": "^7.29.7" + "@babel/compat-data": "^7.29.7", + "@babel/helper-validator-option": "^7.29.7", + "browserslist": "^4.24.0", + "lru-cache": "^5.1.1", + "semver": "^6.3.1" }, "engines": { "node": ">=6.9.0" } }, - "node_modules/@bcoe/v8-coverage": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/@bcoe/v8-coverage/-/v8-coverage-1.0.2.tgz", - "integrity": "sha512-6zABk/ECA/QYSCQ1NGiVwwbQerUCZ+TQbp64Q3AgmfNvurHH0j8TtXa1qbShXA6qqkpAj4V5W8pP6mLe1mcMqA==", + "node_modules/@babel/helper-compilation-targets/node_modules/lru-cache": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", + "integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==", "dev": true, - "license": "MIT", - "engines": { - "node": ">=18" + "license": "ISC", + "dependencies": { + "yallist": "^3.0.2" } }, - "node_modules/@blazediff/core": { - "version": "1.9.1", - "resolved": "https://registry.npmjs.org/@blazediff/core/-/core-1.9.1.tgz", - "integrity": "sha512-ehg3jIkYKulZh+8om/O25vkvSsXXwC+skXmyA87FFx6A/45eqOkZsBltMw/TVteb0mloiGT8oGRTcjRAz66zaA==", + "node_modules/@babel/helper-compilation-targets/node_modules/semver": { + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", "dev": true, - "license": "MIT" + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + } }, - "node_modules/@braintree/sanitize-url": { - "version": "7.1.2", - "resolved": "https://registry.npmjs.org/@braintree/sanitize-url/-/sanitize-url-7.1.2.tgz", - "integrity": "sha512-jigsZK+sMF/cuiB7sERuo9V7N9jx+dhmHHnQyDSVdpZwVutaBu7WvNYqMDLSgFgfB30n452TP3vjDAvFC973mA==", + "node_modules/@babel/helper-create-class-features-plugin": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-create-class-features-plugin/-/helper-create-class-features-plugin-7.29.7.tgz", + "integrity": "sha512-IY3ZD9Tmooqr3TUhc3DUWxiuo8xx1DWLhd5M7hQ+ZWJamqM2BbalrBJb2MisSLoYorOj75U03qULCxQTY9r3hg==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "@babel/helper-annotate-as-pure": "^7.29.7", + "@babel/helper-member-expression-to-functions": "^7.29.7", + "@babel/helper-optimise-call-expression": "^7.29.7", + "@babel/helper-replace-supers": "^7.29.7", + "@babel/helper-skip-transparent-expression-wrappers": "^7.29.7", + "@babel/traverse": "^7.29.7", + "semver": "^6.3.1" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" + } }, - "node_modules/@chevrotain/types": { - "version": "11.1.2", - "resolved": "https://registry.npmjs.org/@chevrotain/types/-/types-11.1.2.tgz", - "integrity": "sha512-U+HFai5+zmJCkK86QsaJtoITlboZHBqrVketcO2ROv865xfCMSFpELQoz1GkX5GzME8pTa+3kbKrZHQtI0gdbw==", + "node_modules/@babel/helper-create-class-features-plugin/node_modules/semver": { + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", "dev": true, - "license": "Apache-2.0" + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + } }, - "node_modules/@chromatic-com/storybook": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/@chromatic-com/storybook/-/storybook-5.0.0.tgz", - "integrity": "sha512-8wUsqL8kg6R5ue8XNE7Jv/iD1SuE4+6EXMIGIuE+T2loBITEACLfC3V8W44NJviCLusZRMWbzICddz0nU0bFaw==", + "node_modules/@babel/helper-create-regexp-features-plugin": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-create-regexp-features-plugin/-/helper-create-regexp-features-plugin-7.29.7.tgz", + "integrity": "sha512-907Uymvqgg1dwUA+7IGwFAOSYzQOuzPXKNJ1yxzwPffzkYFg2q2eHi1fIOs6sXkG9NbIUMunnUlkYsfRFNvomg==", "dev": true, "license": "MIT", "dependencies": { - "@neoconfetti/react": "^1.0.0", - "chromatic": "^13.3.4", - "filesize": "^10.0.12", - "jsonfile": "^6.1.0", - "strip-ansi": "^7.1.0" + "@babel/helper-annotate-as-pure": "^7.29.7", + "regexpu-core": "^6.3.1", + "semver": "^6.3.1" }, "engines": { - "node": ">=20.0.0", - "yarn": ">=1.22.18" + "node": ">=6.9.0" }, "peerDependencies": { - "storybook": "^0.0.0-0 || ^10.1.0 || ^10.1.0-0 || ^10.2.0-0 || ^10.3.0-0" + "@babel/core": "^7.0.0" } }, - "node_modules/@esbuild/aix-ppc64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.27.7.tgz", - "integrity": "sha512-EKX3Qwmhz1eMdEJokhALr0YiD0lhQNwDqkPYyPhiSwKrh7/4KRjQc04sZ8db+5DVVnZ1LmbNDI1uAMPEUBnQPg==", - "cpu": [ - "ppc64" - ], + "node_modules/@babel/helper-create-regexp-features-plugin/node_modules/semver": { + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "aix" - ], - "engines": { - "node": ">=18" + "license": "ISC", + "bin": { + "semver": "bin/semver.js" } }, - "node_modules/@esbuild/android-arm": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.27.7.tgz", - "integrity": "sha512-jbPXvB4Yj2yBV7HUfE2KHe4GJX51QplCN1pGbYjvsyCZbQmies29EoJbkEc+vYuU5o45AfQn37vZlyXy4YJ8RQ==", - "cpu": [ - "arm" - ], + "node_modules/@babel/helper-define-polyfill-provider": { + "version": "0.6.8", + "resolved": "https://registry.npmjs.org/@babel/helper-define-polyfill-provider/-/helper-define-polyfill-provider-0.6.8.tgz", + "integrity": "sha512-47UwBLPpQi1NoWzLuHNjRoHlYXMwIJoBf7MFou6viC/sIHWYygpvr0B6IAyh5sBdA2nr2LPIRww8lfaUVQINBA==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "android" - ], - "engines": { - "node": ">=18" + "dependencies": { + "@babel/helper-compilation-targets": "^7.28.6", + "@babel/helper-plugin-utils": "^7.28.6", + "debug": "^4.4.3", + "lodash.debounce": "^4.0.8", + "resolve": "^1.22.11" + }, + "peerDependencies": { + "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" } }, - "node_modules/@esbuild/android-arm64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.27.7.tgz", - "integrity": "sha512-62dPZHpIXzvChfvfLJow3q5dDtiNMkwiRzPylSCfriLvZeq0a1bWChrGx/BbUbPwOrsWKMn8idSllklzBy+dgQ==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/helper-globals": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-globals/-/helper-globals-7.29.7.tgz", + "integrity": "sha512-3nQVUAtvkKH9zahfWgw96Jc/uFOmjACE1kQz82E2lqWmHBgjzbNlsC22nuQTfahmWeQtTq5nQ/4Nnd2A1wj4zA==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "android" - ], "engines": { - "node": ">=18" + "node": ">=6.9.0" } }, - "node_modules/@esbuild/android-x64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.27.7.tgz", - "integrity": "sha512-x5VpMODneVDb70PYV2VQOmIUUiBtY3D3mPBG8NxVk5CogneYhkR7MmM3yR/uMdITLrC1ml/NV1rj4bMJuy9MCg==", - "cpu": [ - "x64" - ], + "node_modules/@babel/helper-member-expression-to-functions": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-member-expression-to-functions/-/helper-member-expression-to-functions-7.29.7.tgz", + "integrity": "sha512-j+7JYmk1JYDtACIGj0QJqqWZjoUpMoEikQGADMaHgCMCSDqd2+P32rfcibUNrGOMWrlzK1WJBdxrB3JJQZwWtg==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "android" - ], + "dependencies": { + "@babel/traverse": "^7.29.7", + "@babel/types": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" } }, - "node_modules/@esbuild/darwin-arm64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.27.7.tgz", - "integrity": "sha512-5lckdqeuBPlKUwvoCXIgI2D9/ABmPq3Rdp7IfL70393YgaASt7tbju3Ac+ePVi3KDH6N2RqePfHnXkaDtY9fkw==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/helper-module-imports": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-module-imports/-/helper-module-imports-7.29.7.tgz", + "integrity": "sha512-ejHwrQQYcm9xnTivShn2IDOlIzInN34AXskvq9QicvCtEzq1Vzclu/tKF8Jq1Cg8JG2GL6/EmjgsCT7lXepE3g==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], + "dependencies": { + "@babel/traverse": "^7.29.7", + "@babel/types": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" } }, - "node_modules/@esbuild/darwin-x64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.27.7.tgz", - "integrity": "sha512-rYnXrKcXuT7Z+WL5K980jVFdvVKhCHhUwid+dDYQpH+qu+TefcomiMAJpIiC2EM3Rjtq0sO3StMV/+3w3MyyqQ==", - "cpu": [ - "x64" - ], + "node_modules/@babel/helper-module-transforms": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-module-transforms/-/helper-module-transforms-7.29.7.tgz", + "integrity": "sha512-UPUVSyXbOh627KiCIGQSgwWzGeBKLkaJ9PJEdrngIwMSzxLR4jS4+f1f1jb7VzBbg8nFLaYotvVPFCTqdrmTAg==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], + "dependencies": { + "@babel/helper-module-imports": "^7.29.7", + "@babel/helper-validator-identifier": "^7.29.7", + "@babel/traverse": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" } }, - "node_modules/@esbuild/freebsd-arm64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.27.7.tgz", - "integrity": "sha512-B48PqeCsEgOtzME2GbNM2roU29AMTuOIN91dsMO30t+Ydis3z/3Ngoj5hhnsOSSwNzS+6JppqWsuhTp6E82l2w==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/helper-optimise-call-expression": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-optimise-call-expression/-/helper-optimise-call-expression-7.29.7.tgz", + "integrity": "sha512-+kmGVjcT9RGYzoDwdwEqEvGgKe3BYq+O1iGzjFubaNgZHwYHP6lsF2Yghf4kEuv9BV7tYDZ913aBW9am6YKong==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ], + "dependencies": { + "@babel/types": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" } }, - "node_modules/@esbuild/freebsd-x64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.27.7.tgz", - "integrity": "sha512-jOBDK5XEjA4m5IJK3bpAQF9/Lelu/Z9ZcdhTRLf4cajlB+8VEhFFRjWgfy3M1O4rO2GQ/b2dLwCUGpiF/eATNQ==", - "cpu": [ - "x64" - ], + "node_modules/@babel/helper-plugin-utils": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-plugin-utils/-/helper-plugin-utils-7.29.7.tgz", + "integrity": "sha512-G7sHYigPY17oO5SYWnfD/0MTBwVR781S/JI643e/JhUYgVgWE/61SoW3NH9KWUKyKq5LVh3npif99Wkt6j86Jw==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ], "engines": { - "node": ">=18" + "node": ">=6.9.0" } }, - "node_modules/@esbuild/linux-arm": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.27.7.tgz", - "integrity": "sha512-RkT/YXYBTSULo3+af8Ib0ykH8u2MBh57o7q/DAs3lTJlyVQkgQvlrPTnjIzzRPQyavxtPtfg0EopvDyIt0j1rA==", - "cpu": [ - "arm" - ], + "node_modules/@babel/helper-remap-async-to-generator": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-remap-async-to-generator/-/helper-remap-async-to-generator-7.29.7.tgz", + "integrity": "sha512-16AMiW26DbXWBbr3B8wNozKM0ydMLB892vaOaJW/fPJdnT8vJk5sdkQcU/isqUxyCE0cEoa8wZOcbgDuC4b6Og==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/helper-annotate-as-pure": "^7.29.7", + "@babel/helper-wrap-function": "^7.29.7", + "@babel/traverse": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" } }, - "node_modules/@esbuild/linux-arm64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.27.7.tgz", - "integrity": "sha512-RZPHBoxXuNnPQO9rvjh5jdkRmVizktkT7TCDkDmQ0W2SwHInKCAV95GRuvdSvA7w4VMwfCjUiPwDi0ZO6Nfe9A==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/helper-replace-supers": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-replace-supers/-/helper-replace-supers-7.29.7.tgz", + "integrity": "sha512-atfGXWSeCiF4DnKZIfmJfQRkSw9b9gNNXR1kqKjbhG4pGYCOnkp8OcTB8E3NXjBu8NpheSnOeNKz8KT7UNFTmQ==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/helper-member-expression-to-functions": "^7.29.7", + "@babel/helper-optimise-call-expression": "^7.29.7", + "@babel/traverse": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" } }, - "node_modules/@esbuild/linux-ia32": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.27.7.tgz", - "integrity": "sha512-GA48aKNkyQDbd3KtkplYWT102C5sn/EZTY4XROkxONgruHPU72l+gW+FfF8tf2cFjeHaRbWpOYa/uRBz/Xq1Pg==", - "cpu": [ - "ia32" - ], + "node_modules/@babel/helper-skip-transparent-expression-wrappers": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-skip-transparent-expression-wrappers/-/helper-skip-transparent-expression-wrappers-7.29.7.tgz", + "integrity": "sha512-brcMGQaVzIeUb+6/bs1Av0f8YuNNjKY2JyvfRCsFuFsdKccEQ5Ges2y74D74NZ1Rz8lKJ9ksJkfqwQFJ/iNEyQ==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/traverse": "^7.29.7", + "@babel/types": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" } }, - "node_modules/@esbuild/linux-loong64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.27.7.tgz", - "integrity": "sha512-a4POruNM2oWsD4WKvBSEKGIiWQF8fZOAsycHOt6JBpZ+JN2n2JH9WAv56SOyu9X5IqAjqSIPTaJkqN8F7XOQ5Q==", - "cpu": [ - "loong64" - ], + "node_modules/@babel/helper-string-parser": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.29.7.tgz", + "integrity": "sha512-Pb5ijPrZ89GDH8223L4UP8i6QApWxs04RbPQJTeWDV0/keR2E36MeKnyr6LYmUUvqRRI+Iv87SuF1W6ErINzYw==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], "engines": { - "node": ">=18" + "node": ">=6.9.0" } }, - "node_modules/@esbuild/linux-mips64el": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.27.7.tgz", - "integrity": "sha512-KabT5I6StirGfIz0FMgl1I+R1H73Gp0ofL9A3nG3i/cYFJzKHhouBV5VWK1CSgKvVaG4q1RNpCTR2LuTVB3fIw==", - "cpu": [ - "mips64el" - ], + "node_modules/@babel/helper-validator-identifier": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.29.7.tgz", + "integrity": "sha512-qehxGkRj55h/ff8EMaJ+cYhyaKlHIxqYDn682wQD7RNp9UujOQsHog2uS0r2vzr4pW+sXf90NeeayjcNaX3fFg==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], "engines": { - "node": ">=18" + "node": ">=6.9.0" } }, - "node_modules/@esbuild/linux-ppc64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.27.7.tgz", - "integrity": "sha512-gRsL4x6wsGHGRqhtI+ifpN/vpOFTQtnbsupUF5R5YTAg+y/lKelYR1hXbnBdzDjGbMYjVJLJTd2OFmMewAgwlQ==", - "cpu": [ - "ppc64" - ], + "node_modules/@babel/helper-validator-option": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-option/-/helper-validator-option-7.29.7.tgz", + "integrity": "sha512-N9ZErrD+yW5geCDtBqnOoxmR8+tNKiGuxKlDpuJxfsqpa2dFcexaziGAE/qoHLiDDreVNMupxGmSoNlyvsA3gw==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], "engines": { - "node": ">=18" + "node": ">=6.9.0" } }, - "node_modules/@esbuild/linux-riscv64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.27.7.tgz", - "integrity": "sha512-hL25LbxO1QOngGzu2U5xeXtxXcW+/GvMN3ejANqXkxZ/opySAZMrc+9LY/WyjAan41unrR3YrmtTsUpwT66InQ==", - "cpu": [ - "riscv64" - ], + "node_modules/@babel/helper-wrap-function": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helper-wrap-function/-/helper-wrap-function-7.29.7.tgz", + "integrity": "sha512-iES0Skag9ERIF68aXadpO6dbXa03mNWK3sEqJaMnLNs/eC3l0lkImdfoy6Y09/SfkpawdAB4RjQ7PVA7TcVGdw==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/template": "^7.29.7", + "@babel/traverse": "^7.29.7", + "@babel/types": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" } }, - "node_modules/@esbuild/linux-s390x": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.27.7.tgz", - "integrity": "sha512-2k8go8Ycu1Kb46vEelhu1vqEP+UeRVj2zY1pSuPdgvbd5ykAw82Lrro28vXUrRmzEsUV0NzCf54yARIK8r0fdw==", - "cpu": [ - "s390x" - ], + "node_modules/@babel/helpers": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/helpers/-/helpers-7.29.7.tgz", + "integrity": "sha512-1k2lAGRMfHTcwuNYcCNUmaUffmQv8KWMfh2iJUUeRlwlwH4FdNG7mfPI10NPfLHJFThE4Tyr4mv7kTNZOiPuBg==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/template": "^7.29.7", + "@babel/types": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" } }, - "node_modules/@esbuild/linux-x64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.27.7.tgz", - "integrity": "sha512-hzznmADPt+OmsYzw1EE33ccA+HPdIqiCRq7cQeL1Jlq2gb1+OyWBkMCrYGBJ+sxVzve2ZJEVeePbLM2iEIZSxA==", - "cpu": [ - "x64" - ], + "node_modules/@babel/parser": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.29.7.tgz", + "integrity": "sha512-hnORnjP/1P/zFEndoeX+n+t1RwWRJiJpM/jO7FW32Kn9r5+sJB2JWOdYo4L6k78j15eCwY3Gm/7364B1EMwtNg==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/types": "^7.29.7" + }, + "bin": { + "parser": "bin/babel-parser.js" + }, "engines": { - "node": ">=18" + "node": ">=6.0.0" } }, - "node_modules/@esbuild/netbsd-arm64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.27.7.tgz", - "integrity": "sha512-b6pqtrQdigZBwZxAn1UpazEisvwaIDvdbMbmrly7cDTMFnw/+3lVxxCTGOrkPVnsYIosJJXAsILG9XcQS+Yu6w==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/plugin-bugfix-firefox-class-in-computed-class-key": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-bugfix-firefox-class-in-computed-class-key/-/plugin-bugfix-firefox-class-in-computed-class-key-7.29.7.tgz", + "integrity": "sha512-j8SrR0zLZrRsC09DlszEx8FpMiwukKffYXMK0d5LmOglO7vGG6sz/BR/20yHqWH+Lnn31JTt2PE3hIWNgM2J6w==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "netbsd" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/traverse": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" } }, - "node_modules/@esbuild/netbsd-x64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.27.7.tgz", - "integrity": "sha512-OfatkLojr6U+WN5EDYuoQhtM+1xco+/6FSzJJnuWiUw5eVcicbyK3dq5EeV/QHT1uy6GoDhGbFpprUiHUYggrw==", - "cpu": [ - "x64" - ], + "node_modules/@babel/plugin-bugfix-safari-class-field-initializer-scope": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-bugfix-safari-class-field-initializer-scope/-/plugin-bugfix-safari-class-field-initializer-scope-7.29.7.tgz", + "integrity": "sha512-r8j8escF+U2FUHo0KOhPUdMzUO+jp9fInva6+ACVAF3Y97Ev+5iNZwiqTghmzNeWwDkOPlYuTcfb1vDaoZKmAQ==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "netbsd" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" } }, - "node_modules/@esbuild/openbsd-arm64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.27.7.tgz", - "integrity": "sha512-AFuojMQTxAz75Fo8idVcqoQWEHIXFRbOc1TrVcFSgCZtQfSdc1RXgB3tjOn/krRHENUB4j00bfGjyl2mJrU37A==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression/-/plugin-bugfix-safari-id-destructuring-collision-in-function-expression-7.29.7.tgz", + "integrity": "sha512-GE1TFSiuFeGsCxmYXZl8HwoPrVlwe4rHPFE8weieGKZqnDORK+Ar3vgWMgW+AOxQ6/2TgLSKx9p6W7O4rC6qgQ==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "openbsd" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" } }, - "node_modules/@esbuild/openbsd-x64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.27.7.tgz", - "integrity": "sha512-+A1NJmfM8WNDv5CLVQYJ5PshuRm/4cI6WMZRg1by1GwPIQPCTs1GLEUHwiiQGT5zDdyLiRM/l1G0Pv54gvtKIg==", - "cpu": [ - "x64" - ], + "node_modules/@babel/plugin-bugfix-safari-rest-destructuring-rhs-array": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-bugfix-safari-rest-destructuring-rhs-array/-/plugin-bugfix-safari-rest-destructuring-rhs-array-7.29.7.tgz", + "integrity": "sha512-oBNVCvnO5tND+xSopWvV8WNGfpTfgP4Zr/YXXSj8zfmcPktp5Ku/aZlsIowgSD4fjmgHn6sGmB9APVsU5zOdhA==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "openbsd" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/helper-skip-transparent-expression-wrappers": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" } }, - "node_modules/@esbuild/openharmony-arm64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.27.7.tgz", - "integrity": "sha512-+KrvYb/C8zA9CU/g0sR6w2RBw7IGc5J2BPnc3dYc5VJxHCSF1yNMxTV5LQ7GuKteQXZtspjFbiuW5/dOj7H4Yw==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/plugin-bugfix-v8-spread-parameters-in-optional-chaining": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-bugfix-v8-spread-parameters-in-optional-chaining/-/plugin-bugfix-v8-spread-parameters-in-optional-chaining-7.29.7.tgz", + "integrity": "sha512-QQt9qKHZ2sg/kivaLr7lnQr8HVrQDdBNSfCsTjiDxRuX/K5ORyKq+Bu8Xr0cDE3Dfkv0cw28Ve0EKyKMvulkOw==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "openharmony" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/helper-skip-transparent-expression-wrappers": "^7.29.7", + "@babel/plugin-transform-optional-chaining": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.13.0" } }, - "node_modules/@esbuild/sunos-x64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.27.7.tgz", - "integrity": "sha512-ikktIhFBzQNt/QDyOL580ti9+5mL/YZeUPKU2ivGtGjdTYoqz6jObj6nOMfhASpS4GU4Q/Clh1QtxWAvcYKamA==", - "cpu": [ - "x64" - ], + "node_modules/@babel/plugin-bugfix-v8-static-class-fields-redefine-readonly": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-bugfix-v8-static-class-fields-redefine-readonly/-/plugin-bugfix-v8-static-class-fields-redefine-readonly-7.29.7.tgz", + "integrity": "sha512-pn6QacGLgvCcwc+syUhKE/qSjV2D1IHDB84RNxWYSt1mW3K/SCtjinZ2p0cETJxAWBjPy3K/1lHwG5BjjPxNlw==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "sunos" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/traverse": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" } }, - "node_modules/@esbuild/win32-arm64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.27.7.tgz", - "integrity": "sha512-7yRhbHvPqSpRUV7Q20VuDwbjW5kIMwTHpptuUzV+AA46kiPze5Z7qgt6CLCK3pWFrHeNfDd1VKgyP4O+ng17CA==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/plugin-proposal-private-property-in-object": { + "version": "7.21.0-placeholder-for-preset-env.2", + "resolved": "https://registry.npmjs.org/@babel/plugin-proposal-private-property-in-object/-/plugin-proposal-private-property-in-object-7.21.0-placeholder-for-preset-env.2.tgz", + "integrity": "sha512-SOSkfJDddaM7mak6cPEpswyTRnuRltl429hMraQEglW+OkovnCzsiszTmsrlY//qLFjCpQDFRvjdm2wA5pPm9w==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "win32" - ], "engines": { - "node": ">=18" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@esbuild/win32-ia32": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.27.7.tgz", - "integrity": "sha512-SmwKXe6VHIyZYbBLJrhOoCJRB/Z1tckzmgTLfFYOfpMAx63BJEaL9ExI8x7v0oAO3Zh6D/Oi1gVxEYr5oUCFhw==", - "cpu": [ - "ia32" - ], + "node_modules/@babel/plugin-syntax-import-assertions": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-import-assertions/-/plugin-syntax-import-assertions-7.29.7.tgz", + "integrity": "sha512-/An1OCBN93thpBAGyfsK2pcf0jvju1SAtKkL2Ny++B5Sy6sqgzXDQH1cZxWbF96Wuk+bn41MDA9bLd4VVAw6rw==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "win32" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@esbuild/win32-x64": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.27.7.tgz", - "integrity": "sha512-56hiAJPhwQ1R4i+21FVF7V8kSD5zZTdHcVuRFMW0hn753vVfQN8xlx4uOPT4xoGH0Z/oVATuR82AiqSTDIpaHg==", - "cpu": [ - "x64" - ], + "node_modules/@babel/plugin-syntax-import-attributes": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-import-attributes/-/plugin-syntax-import-attributes-7.29.7.tgz", + "integrity": "sha512-zGYcYfq/WmZ4V+kBIXQon9dSSc8ircGZqw9ZaNhhGj9nZkeBu1jHLBDQqYYi5WA9uawvA2sIMbry2nCFhf5Djg==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "win32" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@eslint-community/eslint-utils": { - "version": "4.9.1", - "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.9.1.tgz", - "integrity": "sha512-phrYmNiYppR7znFEdqgfWHXR6NCkZEK7hwWDHZUjit/2/U0r6XvkDl0SYnoM51Hq7FhCGdLDT6zxCCOY1hexsQ==", + "node_modules/@babel/plugin-syntax-unicode-sets-regex": { + "version": "7.18.6", + "resolved": "https://registry.npmjs.org/@babel/plugin-syntax-unicode-sets-regex/-/plugin-syntax-unicode-sets-regex-7.18.6.tgz", + "integrity": "sha512-727YkEAPwSIQTv5im8QHz3upqp92JTWhidIC81Tdx4VJYIte/VndKf1qKrfnnhPLiPghStWfvC/iFaMCQu7Nqg==", "dev": true, "license": "MIT", "dependencies": { - "eslint-visitor-keys": "^3.4.3" + "@babel/helper-create-regexp-features-plugin": "^7.18.6", + "@babel/helper-plugin-utils": "^7.18.6" }, "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" - }, - "funding": { - "url": "https://opencollective.com/eslint" + "node": ">=6.9.0" }, "peerDependencies": { - "eslint": "^6.0.0 || ^7.0.0 || >=8.0.0" + "@babel/core": "^7.0.0" } }, - "node_modules/@eslint-community/eslint-utils/node_modules/eslint-visitor-keys": { - "version": "3.4.3", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-3.4.3.tgz", - "integrity": "sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag==", + "node_modules/@babel/plugin-transform-arrow-functions": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-arrow-functions/-/plugin-transform-arrow-functions-7.29.7.tgz", + "integrity": "sha512-N7zArUXWzAMzm+/N0uPBeVB3Fam5lMxtUwMmDK5f/IBBS7a7p1qeUoxd/6CckXoxUdgsntq1Dh8xNW06maZbDQ==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": "^12.22.0 || ^14.17.0 || >=16.0.0" + "node": ">=6.9.0" }, - "funding": { - "url": "https://opencollective.com/eslint" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@eslint-community/regexpp": { - "version": "4.12.2", - "resolved": "https://registry.npmjs.org/@eslint-community/regexpp/-/regexpp-4.12.2.tgz", - "integrity": "sha512-EriSTlt5OC9/7SXkRSCAhfSxxoSUgBm33OH+IkwbdpgoqsSsUg7y3uh+IICI/Qg4BBWr3U2i39RpmycbxMq4ew==", + "node_modules/@babel/plugin-transform-async-generator-functions": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-async-generator-functions/-/plugin-transform-async-generator-functions-7.29.7.tgz", + "integrity": "sha512-d98gXZkgswvkyohMBABkhm3GeXhYj8psWfwQ2C7gtfrKGTykQa/iOIi+JJhwMjPlZ6Vm2XN+DCf3Es1EoG4ZLA==", "dev": true, "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/helper-remap-async-to-generator": "^7.29.7", + "@babel/traverse": "^7.29.7" + }, "engines": { - "node": "^12.0.0 || ^14.0.0 || >=16.0.0" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@eslint/compat": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/@eslint/compat/-/compat-1.4.1.tgz", - "integrity": "sha512-cfO82V9zxxGBxcQDr1lfaYB7wykTa0b00mGa36FrJl7iTFd0Z2cHfEYuxcBRP/iNijCsWsEkA+jzT8hGYmv33w==", + "node_modules/@babel/plugin-transform-async-to-generator": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-async-to-generator/-/plugin-transform-async-to-generator-7.29.7.tgz", + "integrity": "sha512-pcUb2SS+RMo9TWVBwKGI5ShtoG7R+zBsFmCKDa6fe8c+hPr3XJlZgoE5j6i8W7gDjhyvy+85vmYexanvXh3d1w==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", "dependencies": { - "@eslint/core": "^0.17.0" + "@babel/helper-module-imports": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/helper-remap-async-to-generator": "^7.29.7" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": ">=6.9.0" }, "peerDependencies": { - "eslint": "^8.40 || 9" - }, - "peerDependenciesMeta": { - "eslint": { - "optional": true - } + "@babel/core": "^7.0.0-0" } }, - "node_modules/@eslint/config-array": { - "version": "0.21.1", - "resolved": "https://registry.npmjs.org/@eslint/config-array/-/config-array-0.21.1.tgz", - "integrity": "sha512-aw1gNayWpdI/jSYVgzN5pL0cfzU02GT3NBpeT/DXbx1/1x7ZKxFPd9bwrzygx/qiwIQiJ1sw/zD8qY/kRvlGHA==", + "node_modules/@babel/plugin-transform-block-scoped-functions": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-block-scoped-functions/-/plugin-transform-block-scoped-functions-7.29.7.tgz", + "integrity": "sha512-cUSmjh72N+rN4PrkFlN1dJwNCwjVp5d38/CQrEsFggkD10UiFlBFgdH3tv5dNsLuHY+3S8db2xCHjhZcv5WgvA==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", "dependencies": { - "@eslint/object-schema": "^2.1.7", - "debug": "^4.3.1", - "minimatch": "^3.1.2" + "@babel/helper-plugin-utils": "^7.29.7" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@eslint/config-helpers": { - "version": "0.4.2", - "resolved": "https://registry.npmjs.org/@eslint/config-helpers/-/config-helpers-0.4.2.tgz", - "integrity": "sha512-gBrxN88gOIf3R7ja5K9slwNayVcZgK6SOUORm2uBzTeIEfeVaIhOpCtTox3P6R7o2jLFwLFTLnC7kU/RGcYEgw==", + "node_modules/@babel/plugin-transform-block-scoping": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-block-scoping/-/plugin-transform-block-scoping-7.29.7.tgz", + "integrity": "sha512-ONyr4+AZhKh8yKWInVxU9AXA9EbsyeLcL6V0dJy6M2/62vuvpGm29zzuymbTpdc451GEpDIdAyPLP3r+P61yKQ==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", "dependencies": { - "@eslint/core": "^0.17.0" + "@babel/helper-plugin-utils": "^7.29.7" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@eslint/core": { - "version": "0.17.0", - "resolved": "https://registry.npmjs.org/@eslint/core/-/core-0.17.0.tgz", - "integrity": "sha512-yL/sLrpmtDaFEiUj1osRP4TI2MDz1AddJL+jZ7KSqvBuliN4xqYY54IfdN8qD8Toa6g1iloph1fxQNkjOxrrpQ==", + "node_modules/@babel/plugin-transform-class-properties": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-class-properties/-/plugin-transform-class-properties-7.29.7.tgz", + "integrity": "sha512-GtcpjFvanPfzNQi3eTitsCqtRRmmqzpy/A+yhTR1HaZo1Ly3EA8ZXxlPyHdR8/IuRMYc3E4wdGBewB2QKQjAaA==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", "dependencies": { - "@types/json-schema": "^7.0.15" + "@babel/helper-create-class-features-plugin": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@eslint/eslintrc": { - "version": "3.3.3", - "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-3.3.3.tgz", - "integrity": "sha512-Kr+LPIUVKz2qkx1HAMH8q1q6azbqBAsXJUxBl/ODDuVPX45Z9DfwB8tPjTi6nNZ8BuM3nbJxC5zCAg5elnBUTQ==", + "node_modules/@babel/plugin-transform-class-static-block": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-class-static-block/-/plugin-transform-class-static-block-7.29.7.tgz", + "integrity": "sha512-kibJgmEdX2iMwsHY2tSZNDgj8PwIlCQz7FK9KuGKO8zsuoUwSEhoNnNVp/emKWrbY4HeO6kkXfdMqRKKKXBm2A==", "dev": true, "license": "MIT", "dependencies": { - "ajv": "^6.12.4", - "debug": "^4.3.2", - "espree": "^10.0.1", - "globals": "^14.0.0", - "ignore": "^5.2.0", - "import-fresh": "^3.2.1", - "js-yaml": "^4.1.1", - "minimatch": "^3.1.2", - "strip-json-comments": "^3.1.1" + "@babel/helper-create-class-features-plugin": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": ">=6.9.0" }, - "funding": { - "url": "https://opencollective.com/eslint" + "peerDependencies": { + "@babel/core": "^7.12.0" } }, - "node_modules/@eslint/eslintrc/node_modules/globals": { - "version": "14.0.0", - "resolved": "https://registry.npmjs.org/globals/-/globals-14.0.0.tgz", - "integrity": "sha512-oahGvuMGQlPw/ivIYBjVSrWAfWLBeku5tpPE2fOPLi+WHffIWbuh2tCjhyQhTBPMf5E9jDEH4FOmTYgYwbKwtQ==", + "node_modules/@babel/plugin-transform-classes": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-classes/-/plugin-transform-classes-7.29.7.tgz", + "integrity": "sha512-qV0OGGBVacduzQHE649JyCneOFI/maT+YKsO+K4Yi3xv2wTPNjM/W2o2gdzMwEAZz7fXNTHAe0NcSg30bIN69g==", "dev": true, "license": "MIT", + "dependencies": { + "@babel/helper-annotate-as-pure": "^7.29.7", + "@babel/helper-compilation-targets": "^7.29.7", + "@babel/helper-globals": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/helper-replace-supers": "^7.29.7", + "@babel/traverse": "^7.29.7" + }, "engines": { - "node": ">=18" + "node": ">=6.9.0" }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@eslint/js": { - "version": "9.39.2", - "resolved": "https://registry.npmjs.org/@eslint/js/-/js-9.39.2.tgz", - "integrity": "sha512-q1mjIoW1VX4IvSocvM/vbTiveKC4k9eLrajNEuSsmjymSDEbpGddtpfOoN7YGAqBK3NG+uqo8ia4PDTt8buCYA==", + "node_modules/@babel/plugin-transform-computed-properties": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-computed-properties/-/plugin-transform-computed-properties-7.29.7.tgz", + "integrity": "sha512-RK7/IyU5phpuCdBAuig5VkzG/EnbDaui5SQGdU9BFrHdV+mV4cUjLMQ9lJDjLNtWHsqtiefpGZUXQP2BiTYMsA==", "dev": true, "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/template": "^7.29.7" + }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": ">=6.9.0" }, - "funding": { - "url": "https://eslint.org/donate" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@eslint/object-schema": { - "version": "2.1.7", - "resolved": "https://registry.npmjs.org/@eslint/object-schema/-/object-schema-2.1.7.tgz", - "integrity": "sha512-VtAOaymWVfZcmZbp6E2mympDIHvyjXs/12LqWYjVw6qjrfF+VK+fyG33kChz3nnK+SU5/NeHOqrTEHS8sXO3OA==", + "node_modules/@babel/plugin-transform-destructuring": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-destructuring/-/plugin-transform-destructuring-7.29.7.tgz", + "integrity": "sha512-iPX8aD6H9zV5s7ZsqTdNocPN/MGQ5sSMnElKrktxjJRMnB2jN/1p2+R7GkfD6CAYoVFqy5A4XnSIUeGgJzIWpg==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/traverse": "^7.29.7" + }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@eslint/plugin-kit": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/@eslint/plugin-kit/-/plugin-kit-0.4.1.tgz", - "integrity": "sha512-43/qtrDUokr7LJqoF2c3+RInu/t4zfrpYdoSDfYyhg52rwLV6TnOvdG4fXm7IkSB3wErkcmJS9iEhjVtOSEjjA==", + "node_modules/@babel/plugin-transform-dotall-regex": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-dotall-regex/-/plugin-transform-dotall-regex-7.29.7.tgz", + "integrity": "sha512-3qc18hsD2RdZiyJNDNc7HQpv6xbncwh8FYtxNFFzclSyh/trPD9KkVR9BDECUjDLvb7yJVF15GfYUuC+LMkkiQ==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", "dependencies": { - "@eslint/core": "^0.17.0", - "levn": "^0.4.1" + "@babel/helper-create-regexp-features-plugin": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@floating-ui/core": { - "version": "1.7.2", - "resolved": "https://registry.npmjs.org/@floating-ui/core/-/core-1.7.2.tgz", - "integrity": "sha512-wNB5ooIKHQc+Kui96jE/n69rHFWAVoxn5CAzL1Xdd8FG03cgY3MLO+GF9U3W737fYDSgPWA6MReKhBQBop6Pcw==", + "node_modules/@babel/plugin-transform-duplicate-keys": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-duplicate-keys/-/plugin-transform-duplicate-keys-7.29.7.tgz", + "integrity": "sha512-6IvRRriEMqnBwD6chtxdLpMYCHWEzN+oL5cyQtjykya19UgzbmKhxmhZgKC/LHxS2nYr9Q/qYPZ5Lr6jOL9+yQ==", "dev": true, "license": "MIT", "dependencies": { - "@floating-ui/utils": "^0.2.10" + "@babel/helper-plugin-utils": "^7.29.7" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@floating-ui/dom": { - "version": "1.7.2", - "resolved": "https://registry.npmjs.org/@floating-ui/dom/-/dom-1.7.2.tgz", - "integrity": "sha512-7cfaOQuCS27HD7DX+6ib2OrnW+b4ZBwDNnCcT0uTyidcmyWb03FnQqJybDBoCnpdxwBSfA94UAYlRCt7mV+TbA==", + "node_modules/@babel/plugin-transform-duplicate-named-capturing-groups-regex": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-duplicate-named-capturing-groups-regex/-/plugin-transform-duplicate-named-capturing-groups-regex-7.29.7.tgz", + "integrity": "sha512-2wiIyo2BjtgU7HufSeDnL9L2O7zr8jmhFKuSr65VpRkUiRKRNpb0mdlk56+XPPKoIrfHqzbMuglDvZun0RISsA==", "dev": true, "license": "MIT", "dependencies": { - "@floating-ui/core": "^1.7.2", - "@floating-ui/utils": "^0.2.10" + "@babel/helper-create-regexp-features-plugin": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" } }, - "node_modules/@floating-ui/utils": { - "version": "0.2.10", - "resolved": "https://registry.npmjs.org/@floating-ui/utils/-/utils-0.2.10.tgz", - "integrity": "sha512-aGTxbpbg8/b5JfU1HXSrbH3wXZuLPJcNEcZQFMxLs3oSzgtVu6nFPkbbGGUvBcUjKV2YyB9Wxxabo+HEH9tcRQ==", - "dev": true, - "license": "MIT" - }, - "node_modules/@hono/node-server": { - "version": "1.19.13", - "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.19.13.tgz", - "integrity": "sha512-TsQLe4i2gvoTtrHje625ngThGBySOgSK3Xo2XRYOdqGN1teR8+I7vchQC46uLJi8OF62YTYA3AhSpumtkhsaKQ==", + "node_modules/@babel/plugin-transform-dynamic-import": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-dynamic-import/-/plugin-transform-dynamic-import-7.29.7.tgz", + "integrity": "sha512-giOlEm/EFjfjr+te9NsdjkUo2v4f8rS/SXPumRVHAtbNcyNlvtREkU1dZzaIDclNpnaVhlCqRdFKhJBjBikzLg==", "dev": true, "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">=18.14.1" + "node": ">=6.9.0" }, "peerDependencies": { - "hono": "^4" + "@babel/core": "^7.0.0-0" } }, - "node_modules/@humanfs/core": { - "version": "0.19.1", - "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.1.tgz", - "integrity": "sha512-5DyQ4+1JEUzejeK1JGICcideyfUbGixgS9jNgex5nqkW+cY7WZhxBigmieN5Qnw9ZosSNVC9KQKyb+GUaGyKUA==", + "node_modules/@babel/plugin-transform-explicit-resource-management": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-explicit-resource-management/-/plugin-transform-explicit-resource-management-7.29.7.tgz", + "integrity": "sha512-Rstj7coNz8sE+7Ju7ihpHLI564lsK5pUpNNlvptCIC/16E/S5hbl6n3kESPKdNRmqEWlpn5xpS5Q2dvXBsySLw==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/plugin-transform-destructuring": "^7.29.7" + }, "engines": { - "node": ">=18.18.0" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@humanfs/node": { - "version": "0.16.6", - "resolved": "https://registry.npmjs.org/@humanfs/node/-/node-0.16.6.tgz", - "integrity": "sha512-YuI2ZHQL78Q5HbhDiBA1X4LmYdXCKCMQIfw0pw7piHJwyREFebJUvrQN4cMssyES6x+vfUbx1CIpaQUKYdQZOw==", + "node_modules/@babel/plugin-transform-exponentiation-operator": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-exponentiation-operator/-/plugin-transform-exponentiation-operator-7.29.7.tgz", + "integrity": "sha512-zFpMOTLZBdW5LfObqcSbL6kefg4R4eLdmvS0wbN9M6D5Mym/sKm9toOoWyVOa+xDjvCnuWcHls2YonXwHvH3CQ==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", "dependencies": { - "@humanfs/core": "^0.19.1", - "@humanwhocodes/retry": "^0.3.0" + "@babel/helper-plugin-utils": "^7.29.7" }, "engines": { - "node": ">=18.18.0" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@humanfs/node/node_modules/@humanwhocodes/retry": { - "version": "0.3.1", - "resolved": "https://registry.npmjs.org/@humanwhocodes/retry/-/retry-0.3.1.tgz", - "integrity": "sha512-JBxkERygn7Bv/GbN5Rv8Ul6LVknS+5Bp6RgDC/O8gEBU/yeH5Ui5C/OlWrTb6qct7LjjfT6Re2NxB0ln0yYybA==", + "node_modules/@babel/plugin-transform-export-namespace-from": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-export-namespace-from/-/plugin-transform-export-namespace-from-7.29.7.tgz", + "integrity": "sha512-24B2nOy2TeJSMheqwPD4DDQOV/elLSIlKxjZt4i05H5AgdPdWR3n18HnNrcJ+j76WJd9gbwb9jPjNYUy6RautA==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">=18.18" + "node": ">=6.9.0" }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/nzakas" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@humanwhocodes/module-importer": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/@humanwhocodes/module-importer/-/module-importer-1.0.1.tgz", - "integrity": "sha512-bxveV4V8v5Yb4ncFTT3rPSgZBOpCkjfK0y4oVVVJwIuDVBRMDXrPyXRL988i5ap9m9bnyEEjWfm5WkBmtffLfA==", + "node_modules/@babel/plugin-transform-for-of": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-for-of/-/plugin-transform-for-of-7.29.7.tgz", + "integrity": "sha512-zeSIHh0+E1Um1WJRXCFlHQYu2ieJNdivLLjlBEp+dIBu3S51n+SZZmIXjxnItw6pz56Cn+KvK68BIBVsxq2JiQ==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/helper-skip-transparent-expression-wrappers": "^7.29.7" + }, "engines": { - "node": ">=12.22" + "node": ">=6.9.0" }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/nzakas" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@humanwhocodes/retry": { - "version": "0.4.3", - "resolved": "https://registry.npmjs.org/@humanwhocodes/retry/-/retry-0.4.3.tgz", - "integrity": "sha512-bV0Tgo9K4hfPCek+aMAn81RppFKv2ySDQeMoSZuvTASywNTnVJCArCZE2FWqpvIatKu7VMRLWlR1EazvVhDyhQ==", + "node_modules/@babel/plugin-transform-function-name": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-function-name/-/plugin-transform-function-name-7.29.7.tgz", + "integrity": "sha512-otRWaHXE6fbAGkePvaj/kvs3HsqXfPhlnzwSOlnFgbqCPMd975dW+4wZ00WFBt+/YlBGcJwNrARQTOJOb4ZrIg==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", + "dependencies": { + "@babel/helper-compilation-targets": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/traverse": "^7.29.7" + }, "engines": { - "node": ">=18.18" + "node": ">=6.9.0" }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/nzakas" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@iconify/types": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/@iconify/types/-/types-2.0.0.tgz", - "integrity": "sha512-+wluvCrRhXrhyOmRDJ3q8mux9JkKy5SJ/v8ol2tu4FVjyYvtEzkc/3pK15ET6RKg4b4w4BmTk1+gsCUhf21Ykg==", - "dev": true, - "license": "MIT" - }, - "node_modules/@iconify/utils": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/@iconify/utils/-/utils-3.1.3.tgz", - "integrity": "sha512-LPKOXPn/zV+zis1oOfGWogaXVpqUybF3ZS6SCZIsz8vg0ivVp9+fVqyYB7xq0aiST/VhUQYGO1qo6uoYSiEJqw==", + "node_modules/@babel/plugin-transform-json-strings": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-json-strings/-/plugin-transform-json-strings-7.29.7.tgz", + "integrity": "sha512-RRnE2+eon1rJAq8MnoF1b5kTpY1vU88twHcvcKMrsqP/jxIRqDVs9iJB5fqPuqyeFAW0wJo4MlUIPpQCq/aRsg==", "dev": true, "license": "MIT", "dependencies": { - "@antfu/install-pkg": "^1.1.0", - "@iconify/types": "^2.0.0", - "import-meta-resolve": "^4.2.0" + "@babel/helper-plugin-utils": "^7.29.7" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@internationalized/date": { - "version": "3.10.1", - "resolved": "https://registry.npmjs.org/@internationalized/date/-/date-3.10.1.tgz", - "integrity": "sha512-oJrXtQiAXLvT9clCf1K4kxp3eKsQhIaZqxEyowkBcsvZDdZkbWrVmnGknxs5flTD0VGsxrxKgBCZty1EzoiMzA==", + "node_modules/@babel/plugin-transform-literals": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-literals/-/plugin-transform-literals-7.29.7.tgz", + "integrity": "sha512-DZ/oLP21ZuWx1vKqnoNv6/tvEK48AQOBRai40CX9dTjGluvT/YZCyY3rryDtyUqCEoyNroy5KKPwX2iQCiRvyw==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", "dependencies": { - "@swc/helpers": "^0.5.0" + "@babel/helper-plugin-utils": "^7.29.7" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@isaacs/fs-minipass": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/@isaacs/fs-minipass/-/fs-minipass-4.0.1.tgz", - "integrity": "sha512-wgm9Ehl2jpeqP3zw/7mo3kRHFp5MEDhqAdwy1fTGkHAwnkGOVsgpvQhL8B5n1qlb01jV3n/bI0ZfZp5lWA1k4w==", + "node_modules/@babel/plugin-transform-logical-assignment-operators": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-logical-assignment-operators/-/plugin-transform-logical-assignment-operators-7.29.7.tgz", + "integrity": "sha512-A0H91hh6W8MFRkp5TqJmMr39jzGD1A1E1Ysiv2O06Sfbhkapm+XyIzxWCEh5kqwOZ1/8QZ0dY3SeQ7XBqfJd5Q==", "dev": true, - "license": "ISC", + "license": "MIT", "dependencies": { - "minipass": "^7.0.4" + "@babel/helper-plugin-utils": "^7.29.7" }, "engines": { - "node": ">=18.0.0" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@jridgewell/gen-mapping": { - "version": "0.3.12", - "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.12.tgz", - "integrity": "sha512-OuLGC46TjB5BbN1dH8JULVVZY4WTdkF7tV9Ys6wLL1rubZnCMstOhNHueU5bLCrnRuDhKPDM4g6sw4Bel5Gzqg==", + "node_modules/@babel/plugin-transform-member-expression-literals": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-member-expression-literals/-/plugin-transform-member-expression-literals-7.29.7.tgz", + "integrity": "sha512-hl1kwFZCCiDyfH25Xmco9jTrkPgnS9pmOzSG7W5I4SaGbLeqKv417hcU2RKmaxoPEgsoJh7ZPOrnPGq99bHoUg==", "dev": true, "license": "MIT", "dependencies": { - "@jridgewell/sourcemap-codec": "^1.5.0", - "@jridgewell/trace-mapping": "^0.3.24" + "@babel/helper-plugin-utils": "^7.29.7" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@jridgewell/remapping": { - "version": "2.3.5", - "resolved": "https://registry.npmjs.org/@jridgewell/remapping/-/remapping-2.3.5.tgz", - "integrity": "sha512-LI9u/+laYG4Ds1TDKSJW2YPrIlcVYOwi2fUC6xB43lueCjgxV4lffOCZCtYFiH6TNOX+tQKXx97T4IKHbhyHEQ==", + "node_modules/@babel/plugin-transform-modules-amd": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-modules-amd/-/plugin-transform-modules-amd-7.29.7.tgz", + "integrity": "sha512-fxtQoH3m5ywUSIfaH0FGCzWu4McsYon5bD3K4XnskC7f+OyQMj7rsOMi4NvvmJ83WwBAg4UCe+ov4VZlqEvyew==", "dev": true, "license": "MIT", "dependencies": { - "@jridgewell/gen-mapping": "^0.3.5", - "@jridgewell/trace-mapping": "^0.3.24" + "@babel/helper-module-transforms": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@jridgewell/resolve-uri": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", - "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", + "node_modules/@babel/plugin-transform-modules-commonjs": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-modules-commonjs/-/plugin-transform-modules-commonjs-7.29.7.tgz", + "integrity": "sha512-j0vCldybPC5b5dwCQOJ21uKtHzt7hxLygJTg9eF1ScfaikEDNfzn94XoW5Fi+seBR0nCyL23xaBFFkq7dTM8XQ==", "dev": true, "license": "MIT", + "dependencies": { + "@babel/helper-module-transforms": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">=6.0.0" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@jridgewell/sourcemap-codec": { - "version": "1.5.5", - "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", - "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==", - "dev": true, - "license": "MIT" - }, - "node_modules/@jridgewell/trace-mapping": { - "version": "0.3.31", - "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.31.tgz", - "integrity": "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==", + "node_modules/@babel/plugin-transform-modules-systemjs": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-modules-systemjs/-/plugin-transform-modules-systemjs-7.29.7.tgz", + "integrity": "sha512-TM2ZcQLoG2/y4HODiStCo10DibYhWhGWAwVv+EQKmG/7GFl0N+AAmUiXOMKM+aiJ9XBJ9AHVZBvTzMnJ2sM3cQ==", "dev": true, "license": "MIT", "dependencies": { - "@jridgewell/resolve-uri": "^3.1.0", - "@jridgewell/sourcemap-codec": "^1.4.14" + "@babel/helper-module-transforms": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/helper-validator-identifier": "^7.29.7", + "@babel/traverse": "^7.29.7" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@lucide/svelte": { - "version": "0.515.0", - "resolved": "https://registry.npmjs.org/@lucide/svelte/-/svelte-0.515.0.tgz", - "integrity": "sha512-CEAyqcZmNBfYzVgaRmK2RFJP5tnbXxekRyDk0XX/eZQRfsJmkDvmQwXNX8C869BgNeryzmrRyjHhUL6g9ZOHNA==", + "node_modules/@babel/plugin-transform-modules-umd": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-modules-umd/-/plugin-transform-modules-umd-7.29.7.tgz", + "integrity": "sha512-B4UkaTK3QpgCwJnrxKfMPKdo92CN7OKXAlpAAnM3UPu0Q0lCCk57ylA9AJbRy2v8dDKOPAAWcoR6CMyeoHwRCA==", "dev": true, - "license": "ISC", + "license": "MIT", + "dependencies": { + "@babel/helper-module-transforms": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7" + }, + "engines": { + "node": ">=6.9.0" + }, "peerDependencies": { - "svelte": "^5" + "@babel/core": "^7.0.0-0" } }, - "node_modules/@mdx-js/react": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/@mdx-js/react/-/react-3.1.0.tgz", - "integrity": "sha512-QjHtSaoameoalGnKDT3FoIl4+9RwyTmo9ZJGBdLOks/YOiWHoRDI3PUwEzOE7kEmGcV3AFcp9K6dYu9rEuKLAQ==", + "node_modules/@babel/plugin-transform-named-capturing-groups-regex": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-named-capturing-groups-regex/-/plugin-transform-named-capturing-groups-regex-7.29.7.tgz", + "integrity": "sha512-vuFoLwr4qnv2xbZ16SQd6uPcH5FNrLHhk/Jzo++0XJFcaDsr4gjJVg6j398oMHiC+83k/GiBzviwF5KBJkPUtQ==", "dev": true, "license": "MIT", "dependencies": { - "@types/mdx": "^2.0.0" + "@babel/helper-create-regexp-features-plugin": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" + "engines": { + "node": ">=6.9.0" }, "peerDependencies": { - "@types/react": ">=16", - "react": ">=16" + "@babel/core": "^7.0.0" } }, - "node_modules/@mermaid-js/parser": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/@mermaid-js/parser/-/parser-1.1.1.tgz", - "integrity": "sha512-VuHdsYMK1bT6X2JbcAaWAhugTRvRBRyuZgd+c22swUeI9g/ntaxF7CY7dYarhZovofCbUNO0G7JesfmNtjYOCw==", + "node_modules/@babel/plugin-transform-new-target": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-new-target/-/plugin-transform-new-target-7.29.7.tgz", + "integrity": "sha512-fEo41GmsOUhOBlw8ioo6zvjX5Xc2Lqkzlyfqbpsk3eB6TReV18uhxZ0esfEokVbY2+PVJAQHNKxER6lGrzNd3A==", "dev": true, "license": "MIT", "dependencies": { - "@chevrotain/types": "~11.1.1" + "@babel/helper-plugin-utils": "^7.29.7" + }, + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@modelcontextprotocol/sdk": { - "version": "1.26.0", - "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.26.0.tgz", - "integrity": "sha512-Y5RmPncpiDtTXDbLKswIJzTqu2hyBKxTNsgKqKclDbhIgg1wgtf1fRuvxgTnRfcnxtvvgbIEcqUOzZrJ6iSReg==", + "node_modules/@babel/plugin-transform-nullish-coalescing-operator": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-nullish-coalescing-operator/-/plugin-transform-nullish-coalescing-operator-7.29.7.tgz", + "integrity": "sha512-idmp1dFaekP9GbcMvG24Kvw2BfhFZjHnNJCkV4WuIY4PskJzwI3f1N5OdgYke38T7rftO6ERulFRn2cFeZwRkg==", "dev": true, "license": "MIT", "dependencies": { - "@hono/node-server": "^1.19.9", - "ajv": "^8.17.1", - "ajv-formats": "^3.0.1", - "content-type": "^1.0.5", - "cors": "^2.8.5", - "cross-spawn": "^7.0.5", - "eventsource": "^3.0.2", - "eventsource-parser": "^3.0.0", - "express": "^5.2.1", - "express-rate-limit": "^8.2.1", - "hono": "^4.11.4", - "jose": "^6.1.3", - "json-schema-typed": "^8.0.2", - "pkce-challenge": "^5.0.0", - "raw-body": "^3.0.0", - "zod": "^3.25 || ^4.0", - "zod-to-json-schema": "^3.25.1" + "@babel/helper-plugin-utils": "^7.29.7" }, "engines": { - "node": ">=18" + "node": ">=6.9.0" }, "peerDependencies": { - "@cfworker/json-schema": "^4.1.1", - "zod": "^3.25 || ^4.0" - }, - "peerDependenciesMeta": { - "@cfworker/json-schema": { - "optional": true - }, - "zod": { - "optional": false - } + "@babel/core": "^7.0.0-0" } }, - "node_modules/@modelcontextprotocol/sdk/node_modules/ajv": { - "version": "8.18.0", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.18.0.tgz", - "integrity": "sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A==", + "node_modules/@babel/plugin-transform-numeric-separator": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-numeric-separator/-/plugin-transform-numeric-separator-7.29.7.tgz", + "integrity": "sha512-zR7fv/z14OjgHl4AgRtkDBvBMhIzCxqV/qN/2BCRC7LjFwvuzjYe7gDWxC4Wl/SNsLM6SE1IWvRPYMgSJaUvNw==", "dev": true, "license": "MIT", "dependencies": { - "fast-deep-equal": "^3.1.3", - "fast-uri": "^3.0.1", - "json-schema-traverse": "^1.0.0", - "require-from-string": "^2.0.2" + "@babel/helper-plugin-utils": "^7.29.7" }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/epoberezkin" + "engines": { + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@modelcontextprotocol/sdk/node_modules/json-schema-traverse": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", - "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", - "dev": true, - "license": "MIT" - }, - "node_modules/@napi-rs/canvas": { - "version": "0.1.76", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.76.tgz", - "integrity": "sha512-YIk5okeNN53GzjvWmAyCQFE9xrLeQXzYpudX4TiLvqaz9SqXgIgxIuKPe4DKyB5nccsQMIev7JGKTzZaN5rFdw==", + "node_modules/@babel/plugin-transform-object-rest-spread": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-object-rest-spread/-/plugin-transform-object-rest-spread-7.29.7.tgz", + "integrity": "sha512-Ld98jn4c0smUywL57m7SgsHq3OpThOa6LqZJif3G6jYOovPleoFhVrBJ1WegRApSFB2wu4+RelAj9AC9G08Z4A==", "dev": true, "license": "MIT", - "optional": true, - "workspaces": [ - "e2e/*" - ], + "dependencies": { + "@babel/helper-compilation-targets": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/plugin-transform-destructuring": "^7.29.7", + "@babel/plugin-transform-parameters": "^7.29.7", + "@babel/traverse": "^7.29.7" + }, "engines": { - "node": ">= 10" + "node": ">=6.9.0" }, - "optionalDependencies": { - "@napi-rs/canvas-android-arm64": "0.1.76", - "@napi-rs/canvas-darwin-arm64": "0.1.76", - "@napi-rs/canvas-darwin-x64": "0.1.76", - "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.76", - "@napi-rs/canvas-linux-arm64-gnu": "0.1.76", - "@napi-rs/canvas-linux-arm64-musl": "0.1.76", - "@napi-rs/canvas-linux-riscv64-gnu": "0.1.76", - "@napi-rs/canvas-linux-x64-gnu": "0.1.76", - "@napi-rs/canvas-linux-x64-musl": "0.1.76", - "@napi-rs/canvas-win32-x64-msvc": "0.1.76" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@napi-rs/canvas-android-arm64": { - "version": "0.1.76", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.76.tgz", - "integrity": "sha512-7EAfkLBQo2QoEzpHdInFbfEUYTXsiO2hvtFo1D9zfTzcQM8n5piZdOpJ3EIkmpe8yLoSV8HLyUQtq4bv11x6Tg==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/plugin-transform-object-super": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-object-super/-/plugin-transform-object-super-7.29.7.tgz", + "integrity": "sha512-Ea/diGcw0twB5IlZPO5sgET6fJsLJqPABqTuFWIR+iMPGPZJkATEIWx0wa+aEQ5UY1CBQyP/gkAiLEqn1vBiQA==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "android" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/helper-replace-supers": "^7.29.7" + }, "engines": { - "node": ">= 10" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@napi-rs/canvas-darwin-arm64": { - "version": "0.1.76", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.76.tgz", - "integrity": "sha512-Cs8WRMzaWSJWeWY8tvnCe+TuduHUbB0xFhZ0FmOrNy2prPxT4A6aU3FQu8hR9XJw8kKZ7v902wzaDmy9SdhG8A==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/plugin-transform-optional-catch-binding": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-optional-catch-binding/-/plugin-transform-optional-catch-binding-7.29.7.tgz", + "integrity": "sha512-sLsyndxK2VwX6yNUOakMb7Sh553ZTe/vVM1XJ+9Z5aW1ytsc8xOIwmyk05NNjN60vkc5/KqoTH6hB4V41LJhng==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@napi-rs/canvas-darwin-x64": { - "version": "0.1.76", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.76.tgz", - "integrity": "sha512-ya+T6gV9XAq7YAnMa2fKhWXAuRR5cpRny2IoHacoMxgtOARnUkJO/k3hIb52FtMoq7UxLi5+IFGVHU6ZiMu4Ag==", - "cpu": [ - "x64" - ], + "node_modules/@babel/plugin-transform-optional-chaining": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-optional-chaining/-/plugin-transform-optional-chaining-7.29.7.tgz", + "integrity": "sha512-6GM1dhvK3gNODkXcEcMCOLEDCLSoZ/sBbro2Ax8HURyasQ4NshagQixkRFdh5niI6E4gmA/jYI/4aT7rRos3ZQ==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/helper-skip-transparent-expression-wrappers": "^7.29.7" + }, "engines": { - "node": ">= 10" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": { - "version": "0.1.76", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.76.tgz", - "integrity": "sha512-fgnPb+FKVuixACvkHGldJqYXExORBwvqGgL0K80uE6SGH2t0UKD2auHw2CtBy14DUzfg82PkupO2ix2w7kB+Xw==", - "cpu": [ - "arm" - ], + "node_modules/@babel/plugin-transform-parameters": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-parameters/-/plugin-transform-parameters-7.29.7.tgz", + "integrity": "sha512-ZDOBqV/qLYJI0YElr8DcENEyARsFQeESqWXH6gZlghYXuPPjvweuDhP4VyEi4BlUBlLRFZVjxoZDMjxhLW766g==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@napi-rs/canvas-linux-arm64-gnu": { - "version": "0.1.76", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.76.tgz", - "integrity": "sha512-r8OxIenvBPOa4I014k1ZWTCz2dB0ZTsxMP7+ovMOKO7jkl1Z+YZo2OTAqxArpMhN0wdEeI3Lw9zUcn2HgwEgDA==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/plugin-transform-private-methods": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-private-methods/-/plugin-transform-private-methods-7.29.7.tgz", + "integrity": "sha512-/6Rz4DK1ETDEM/bWHsPHcaEe7ZaT1EqSXjtSP/L0DijOYuaUhiRiOKcwpZ8P7zR4xXEHc2ITdiCgBm9Tpyv9ug==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/helper-create-class-features-plugin": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@napi-rs/canvas-linux-arm64-musl": { - "version": "0.1.76", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.76.tgz", - "integrity": "sha512-smxwzKfHYaOYG7QXUuDPrFEC7WqjL3Lx4AM6mk8/FxDAS+8o0eoZJwSu+zXsaBLimEQUozEYgEGtJ2JJ0RdL4A==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/plugin-transform-private-property-in-object": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-private-property-in-object/-/plugin-transform-private-property-in-object-7.29.7.tgz", + "integrity": "sha512-+BNo06dnrzdNNqCm1X6YUaVv0DKk8Q+JYcoZfOkLhYWNCXzlwTSRq8zGWayT1csjcpNXV9CQTBRRbmTLZac5cA==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/helper-annotate-as-pure": "^7.29.7", + "@babel/helper-create-class-features-plugin": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@napi-rs/canvas-linux-riscv64-gnu": { - "version": "0.1.76", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.76.tgz", - "integrity": "sha512-G2PsFwsP+r4syEoNLStV3n1wtNAClwf8s/qB57bexG08R4f4WaiBd+x+d4iYS0Y5o90YIEm8/ewZn4bLIa0wNQ==", - "cpu": [ - "riscv64" - ], + "node_modules/@babel/plugin-transform-property-literals": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-property-literals/-/plugin-transform-property-literals-7.29.7.tgz", + "integrity": "sha512-bOMRLQuI0A5ZqHq3OWJ89/rXpJ/NJrbVhXiP4zwPGMs6kpcVsuTUNjwoE30K0Qm3mf48a/TnRYYD6vPNqcg6jA==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@napi-rs/canvas-linux-x64-gnu": { - "version": "0.1.76", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.76.tgz", - "integrity": "sha512-SNK+vgge4DnuONYdYE3Y09LivGgUiUPQDU+PdGNZJIzIi0hRDLcA59eag8LGeQfPmJW84c1aZD04voihybKFog==", - "cpu": [ - "x64" - ], + "node_modules/@babel/plugin-transform-regenerator": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-regenerator/-/plugin-transform-regenerator-7.29.7.tgz", + "integrity": "sha512-rNNFV0DBAJp988xW2DOntfDoYn1eR8GGF5AT5vYc+rjyfaQkM242c9tZUHHPe7KYaiJizXPWhQTzzdbXySyhBw==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@napi-rs/canvas-linux-x64-musl": { - "version": "0.1.76", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.76.tgz", - "integrity": "sha512-tWHLBI9iVoR1NsfpHz1MGERTkqcca8akbH/CzX6JQUNC+lJOeYYXeRuK8hKqMIg1LI+4QOMAtHNVeZu8NvjEug==", - "cpu": [ - "x64" - ], + "node_modules/@babel/plugin-transform-regexp-modifiers": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-regexp-modifiers/-/plugin-transform-regexp-modifiers-7.29.7.tgz", + "integrity": "sha512-mB5Fs0VWrJ42ZCmc8114v60qetdaUVNkj9PmSZRmanCZM3S9hm0CFRLjRmYIsuXav14l2jvZ+4T8iiCGnhj3nQ==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/helper-create-regexp-features-plugin": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0" } }, - "node_modules/@napi-rs/canvas-win32-x64-msvc": { - "version": "0.1.76", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.76.tgz", - "integrity": "sha512-ifM5HOGw2hP5QLQzCB41Riw3Pq5yKAAjZpn+lJC0sYBmyS2s/Kq6KpTOKxf0CuptkI1wMcRcYQfhLRdeWiYvIg==", - "cpu": [ - "x64" - ], + "node_modules/@babel/plugin-transform-reserved-words": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-reserved-words/-/plugin-transform-reserved-words-7.29.7.tgz", + "integrity": "sha512-5+YhdpVgmfSmwZyLMftfaiffLRMHjzIRHFHHLdibcSyJm2pasMrKHrO3Ptrt2DRshjvpgjEJJ1zVW14WPq/6QA==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "win32" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10" + "node": ">=6.9.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@neoconfetti/react": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/@neoconfetti/react/-/react-1.0.0.tgz", - "integrity": "sha512-klcSooChXXOzIm+SE5IISIAn3bYzYfPjbX7D7HoqZL84oAfgREeSg5vSIaSFH+DaGzzvImTyWe1OyrJ67vik4A==", - "dev": true, - "license": "MIT" - }, - "node_modules/@parcel/watcher": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@parcel/watcher/-/watcher-2.5.1.tgz", - "integrity": "sha512-dfUnCxiN9H4ap84DvD2ubjw+3vUNpstxa0TneY/Paat8a3R4uQZDLSvWjmznAY/DoahqTHl9V46HF/Zs3F29pg==", + "node_modules/@babel/plugin-transform-shorthand-properties": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-shorthand-properties/-/plugin-transform-shorthand-properties-7.29.7.tgz", + "integrity": "sha512-I+WYbGBAiCn7nA6xBrlgPH+MB7HWb4u8pv5S0Pv7OtwNvIFvCCb24YlttKEeUFVurfBCEaOTnuhlqsb7f0Z5Dg==", "dev": true, - "hasInstallScript": true, "license": "MIT", - "optional": true, "dependencies": { - "detect-libc": "^1.0.3", - "is-glob": "^4.0.3", - "micromatch": "^4.0.5", - "node-addon-api": "^7.0.0" + "@babel/helper-plugin-utils": "^7.29.7" }, "engines": { - "node": ">= 10.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/parcel" + "node": ">=6.9.0" }, - "optionalDependencies": { - "@parcel/watcher-android-arm64": "2.5.1", - "@parcel/watcher-darwin-arm64": "2.5.1", - "@parcel/watcher-darwin-x64": "2.5.1", - "@parcel/watcher-freebsd-x64": "2.5.1", - "@parcel/watcher-linux-arm-glibc": "2.5.1", - "@parcel/watcher-linux-arm-musl": "2.5.1", - "@parcel/watcher-linux-arm64-glibc": "2.5.1", - "@parcel/watcher-linux-arm64-musl": "2.5.1", - "@parcel/watcher-linux-x64-glibc": "2.5.1", - "@parcel/watcher-linux-x64-musl": "2.5.1", - "@parcel/watcher-win32-arm64": "2.5.1", - "@parcel/watcher-win32-ia32": "2.5.1", - "@parcel/watcher-win32-x64": "2.5.1" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@parcel/watcher-android-arm64": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@parcel/watcher-android-arm64/-/watcher-android-arm64-2.5.1.tgz", - "integrity": "sha512-KF8+j9nNbUN8vzOFDpRMsaKBHZ/mcjEjMToVMJOhTozkDonQFFrRcfdLWn6yWKCmJKmdVxSgHiYvTCef4/qcBA==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/plugin-transform-spread": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-spread/-/plugin-transform-spread-7.29.7.tgz", + "integrity": "sha512-/u5K1QWada7tbYNqTjMh96718g9NTwh9tfPJMsSmVsQwGT447FskV+KcfeXkXq2GWki4EM/MuTdmBec+hOuVTQ==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "android" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/helper-skip-transparent-expression-wrappers": "^7.29.7" + }, "engines": { - "node": ">= 10.0.0" + "node": ">=6.9.0" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/parcel" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@parcel/watcher-darwin-arm64": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@parcel/watcher-darwin-arm64/-/watcher-darwin-arm64-2.5.1.tgz", - "integrity": "sha512-eAzPv5osDmZyBhou8PoF4i6RQXAfeKL9tjb3QzYuccXFMQU0ruIc/POh30ePnaOyD1UXdlKguHBmsTs53tVoPw==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/plugin-transform-sticky-regex": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-sticky-regex/-/plugin-transform-sticky-regex-7.29.7.tgz", + "integrity": "sha512-BCHzNYJGe9l7EpwwDBN/ztlL2NYFFq8hp9ddjtUEM9f2O7S7kKV/lL6Fwo7IF7NSkYhPK2vO+86nIGltA90MsA==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10.0.0" + "node": ">=6.9.0" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/parcel" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@parcel/watcher-darwin-x64": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@parcel/watcher-darwin-x64/-/watcher-darwin-x64-2.5.1.tgz", - "integrity": "sha512-1ZXDthrnNmwv10A0/3AJNZ9JGlzrF82i3gNQcWOzd7nJ8aj+ILyW1MTxVk35Db0u91oD5Nlk9MBiujMlwmeXZg==", - "cpu": [ - "x64" - ], + "node_modules/@babel/plugin-transform-template-literals": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-template-literals/-/plugin-transform-template-literals-7.29.7.tgz", + "integrity": "sha512-NCSEJ4sLFU2gqAub45HYh4fus2yQ36rr6ei6vpU7NdoJqCpxvEG8E6eJpscGyXP3VHD2Ny+fSXr04k1hoUrFqA==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10.0.0" + "node": ">=6.9.0" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/parcel" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@parcel/watcher-freebsd-x64": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@parcel/watcher-freebsd-x64/-/watcher-freebsd-x64-2.5.1.tgz", - "integrity": "sha512-SI4eljM7Flp9yPuKi8W0ird8TI/JK6CSxju3NojVI6BjHsTyK7zxA9urjVjEKJ5MBYC+bLmMcbAWlZ+rFkLpJQ==", - "cpu": [ - "x64" - ], + "node_modules/@babel/plugin-transform-typeof-symbol": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-typeof-symbol/-/plugin-transform-typeof-symbol-7.29.7.tgz", + "integrity": "sha512-223mNGoTkBiTEWFoK+Q6Go3tueMRclO8vxxxxquNCYuNI4jWOofFKJRRDu6SDrB8Sgo1UEGW9T4GAQ8ZyRso1A==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "freebsd" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10.0.0" + "node": ">=6.9.0" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/parcel" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@parcel/watcher-linux-arm-glibc": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm-glibc/-/watcher-linux-arm-glibc-2.5.1.tgz", - "integrity": "sha512-RCdZlEyTs8geyBkkcnPWvtXLY44BCeZKmGYRtSgtwwnHR4dxfHRG3gR99XdMEdQ7KeiDdasJwwvNSF5jKtDwdA==", - "cpu": [ - "arm" - ], + "node_modules/@babel/plugin-transform-unicode-escapes": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-unicode-escapes/-/plugin-transform-unicode-escapes-7.29.7.tgz", + "integrity": "sha512-jCfXxSjf94lf4E0hKE0AByxF6F3/pVFqRdUUNkDJhsY0m1ZKjnN6ZYyMeHNpzflxb/0q5b7t3p+BE+SLF1WOtA==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10.0.0" + "node": ">=6.9.0" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/parcel" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@parcel/watcher-linux-arm-musl": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm-musl/-/watcher-linux-arm-musl-2.5.1.tgz", - "integrity": "sha512-6E+m/Mm1t1yhB8X412stiKFG3XykmgdIOqhjWj+VL8oHkKABfu/gjFj8DvLrYVHSBNC+/u5PeNrujiSQ1zwd1Q==", - "cpu": [ - "arm" - ], + "node_modules/@babel/plugin-transform-unicode-property-regex": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-unicode-property-regex/-/plugin-transform-unicode-property-regex-7.29.7.tgz", + "integrity": "sha512-OgZ+zoAJgZLUCunsTRQ5LAjOywDv5zzZ2/hQ5aMw1pGXyY2rtE8/chXYUmu3AlVHKpm10KEdG9aMwbI/K76ZGw==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/helper-create-regexp-features-plugin": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10.0.0" + "node": ">=6.9.0" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/parcel" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@parcel/watcher-linux-arm64-glibc": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm64-glibc/-/watcher-linux-arm64-glibc-2.5.1.tgz", - "integrity": "sha512-LrGp+f02yU3BN9A+DGuY3v3bmnFUggAITBGriZHUREfNEzZh/GO06FF5u2kx8x+GBEUYfyTGamol4j3m9ANe8w==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/plugin-transform-unicode-regex": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-unicode-regex/-/plugin-transform-unicode-regex-7.29.7.tgz", + "integrity": "sha512-7D/x/23/d/3VqZ0QA+LGbZMlGwZjztBygSWWWsfTPoQ1oQ6Q1P6Mr3d0kk42XabyUVw+fha3LqdRsFqeKqvCyA==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/helper-create-regexp-features-plugin": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10.0.0" + "node": ">=6.9.0" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/parcel" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@parcel/watcher-linux-arm64-musl": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm64-musl/-/watcher-linux-arm64-musl-2.5.1.tgz", - "integrity": "sha512-cFOjABi92pMYRXS7AcQv9/M1YuKRw8SZniCDw0ssQb/noPkRzA+HBDkwmyOJYp5wXcsTrhxO0zq1U11cK9jsFg==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/plugin-transform-unicode-sets-regex": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/plugin-transform-unicode-sets-regex/-/plugin-transform-unicode-sets-regex-7.29.7.tgz", + "integrity": "sha512-BLOhLht9DOJwIxlmp91wHvkXv1lguuHS3/FwUO8HL1H0u8s4hR1gASVFyilu9iGtcTRYqjTZmlsFFeQletntEg==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "dependencies": { + "@babel/helper-create-regexp-features-plugin": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7" + }, "engines": { - "node": ">= 10.0.0" + "node": ">=6.9.0" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/parcel" + "peerDependencies": { + "@babel/core": "^7.0.0" } }, - "node_modules/@parcel/watcher-linux-x64-glibc": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-x64-glibc/-/watcher-linux-x64-glibc-2.5.1.tgz", - "integrity": "sha512-GcESn8NZySmfwlTsIur+49yDqSny2IhPeZfXunQi48DMugKeZ7uy1FX83pO0X22sHntJ4Ub+9k34XQCX+oHt2A==", - "cpu": [ - "x64" - ], - "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], + "node_modules/@babel/preset-env": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/preset-env/-/preset-env-7.29.7.tgz", + "integrity": "sha512-GYzX36n1nsciIb0uyH0GHwxwtNwPQIcpxSeiVLDtG/B7jB5xXgchnmL1f/jCX5o+pwnaDBtO60ONSJhEBJfxYA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/compat-data": "^7.29.7", + "@babel/helper-compilation-targets": "^7.29.7", + "@babel/helper-plugin-utils": "^7.29.7", + "@babel/helper-validator-option": "^7.29.7", + "@babel/plugin-bugfix-firefox-class-in-computed-class-key": "^7.29.7", + "@babel/plugin-bugfix-safari-class-field-initializer-scope": "^7.29.7", + "@babel/plugin-bugfix-safari-id-destructuring-collision-in-function-expression": "^7.29.7", + "@babel/plugin-bugfix-safari-rest-destructuring-rhs-array": "^7.29.7", + "@babel/plugin-bugfix-v8-spread-parameters-in-optional-chaining": "^7.29.7", + "@babel/plugin-bugfix-v8-static-class-fields-redefine-readonly": "^7.29.7", + "@babel/plugin-proposal-private-property-in-object": "7.21.0-placeholder-for-preset-env.2", + "@babel/plugin-syntax-import-assertions": "^7.29.7", + "@babel/plugin-syntax-import-attributes": "^7.29.7", + "@babel/plugin-syntax-unicode-sets-regex": "^7.18.6", + "@babel/plugin-transform-arrow-functions": "^7.29.7", + "@babel/plugin-transform-async-generator-functions": "^7.29.7", + "@babel/plugin-transform-async-to-generator": "^7.29.7", + "@babel/plugin-transform-block-scoped-functions": "^7.29.7", + "@babel/plugin-transform-block-scoping": "^7.29.7", + "@babel/plugin-transform-class-properties": "^7.29.7", + "@babel/plugin-transform-class-static-block": "^7.29.7", + "@babel/plugin-transform-classes": "^7.29.7", + "@babel/plugin-transform-computed-properties": "^7.29.7", + "@babel/plugin-transform-destructuring": "^7.29.7", + "@babel/plugin-transform-dotall-regex": "^7.29.7", + "@babel/plugin-transform-duplicate-keys": "^7.29.7", + "@babel/plugin-transform-duplicate-named-capturing-groups-regex": "^7.29.7", + "@babel/plugin-transform-dynamic-import": "^7.29.7", + "@babel/plugin-transform-explicit-resource-management": "^7.29.7", + "@babel/plugin-transform-exponentiation-operator": "^7.29.7", + "@babel/plugin-transform-export-namespace-from": "^7.29.7", + "@babel/plugin-transform-for-of": "^7.29.7", + "@babel/plugin-transform-function-name": "^7.29.7", + "@babel/plugin-transform-json-strings": "^7.29.7", + "@babel/plugin-transform-literals": "^7.29.7", + "@babel/plugin-transform-logical-assignment-operators": "^7.29.7", + "@babel/plugin-transform-member-expression-literals": "^7.29.7", + "@babel/plugin-transform-modules-amd": "^7.29.7", + "@babel/plugin-transform-modules-commonjs": "^7.29.7", + "@babel/plugin-transform-modules-systemjs": "^7.29.7", + "@babel/plugin-transform-modules-umd": "^7.29.7", + "@babel/plugin-transform-named-capturing-groups-regex": "^7.29.7", + "@babel/plugin-transform-new-target": "^7.29.7", + "@babel/plugin-transform-nullish-coalescing-operator": "^7.29.7", + "@babel/plugin-transform-numeric-separator": "^7.29.7", + "@babel/plugin-transform-object-rest-spread": "^7.29.7", + "@babel/plugin-transform-object-super": "^7.29.7", + "@babel/plugin-transform-optional-catch-binding": "^7.29.7", + "@babel/plugin-transform-optional-chaining": "^7.29.7", + "@babel/plugin-transform-parameters": "^7.29.7", + "@babel/plugin-transform-private-methods": "^7.29.7", + "@babel/plugin-transform-private-property-in-object": "^7.29.7", + "@babel/plugin-transform-property-literals": "^7.29.7", + "@babel/plugin-transform-regenerator": "^7.29.7", + "@babel/plugin-transform-regexp-modifiers": "^7.29.7", + "@babel/plugin-transform-reserved-words": "^7.29.7", + "@babel/plugin-transform-shorthand-properties": "^7.29.7", + "@babel/plugin-transform-spread": "^7.29.7", + "@babel/plugin-transform-sticky-regex": "^7.29.7", + "@babel/plugin-transform-template-literals": "^7.29.7", + "@babel/plugin-transform-typeof-symbol": "^7.29.7", + "@babel/plugin-transform-unicode-escapes": "^7.29.7", + "@babel/plugin-transform-unicode-property-regex": "^7.29.7", + "@babel/plugin-transform-unicode-regex": "^7.29.7", + "@babel/plugin-transform-unicode-sets-regex": "^7.29.7", + "@babel/preset-modules": "0.1.6-no-external-plugins", + "babel-plugin-polyfill-corejs2": "^0.4.15", + "babel-plugin-polyfill-corejs3": "^0.14.0", + "babel-plugin-polyfill-regenerator": "^0.6.6", + "core-js-compat": "^3.48.0", + "semver": "^6.3.1" + }, "engines": { - "node": ">= 10.0.0" + "node": ">=6.9.0" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/parcel" + "peerDependencies": { + "@babel/core": "^7.0.0-0" } }, - "node_modules/@parcel/watcher-linux-x64-musl": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-x64-musl/-/watcher-linux-x64-musl-2.5.1.tgz", - "integrity": "sha512-n0E2EQbatQ3bXhcH2D1XIAANAcTZkQICBPVaxMeaCVBtOpBZpWJuf7LwyWPSBDITb7In8mqQgJ7gH8CILCURXg==", - "cpu": [ - "x64" - ], + "node_modules/@babel/preset-env/node_modules/semver": { + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "linux" - ], - "engines": { - "node": ">= 10.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/parcel" + "license": "ISC", + "bin": { + "semver": "bin/semver.js" } }, - "node_modules/@parcel/watcher-win32-arm64": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@parcel/watcher-win32-arm64/-/watcher-win32-arm64-2.5.1.tgz", - "integrity": "sha512-RFzklRvmc3PkjKjry3hLF9wD7ppR4AKcWNzH7kXR7GUe0Igb3Nz8fyPwtZCSquGrhU5HhUNDr/mKBqj7tqA2Vw==", - "cpu": [ - "arm64" - ], + "node_modules/@babel/preset-modules": { + "version": "0.1.6-no-external-plugins", + "resolved": "https://registry.npmjs.org/@babel/preset-modules/-/preset-modules-0.1.6-no-external-plugins.tgz", + "integrity": "sha512-HrcgcIESLm9aIR842yhJ5RWan/gebQUJ6E/E5+rf0y9o6oj7w0Br+sWuL6kEQ/o/AdfvR1Je9jG18/gnpwjEyA==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">= 10.0.0" + "dependencies": { + "@babel/helper-plugin-utils": "^7.0.0", + "@babel/types": "^7.4.4", + "esutils": "^2.0.2" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/parcel" + "peerDependencies": { + "@babel/core": "^7.0.0-0 || ^8.0.0-0 <8.0.0" } }, - "node_modules/@parcel/watcher-win32-ia32": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@parcel/watcher-win32-ia32/-/watcher-win32-ia32-2.5.1.tgz", - "integrity": "sha512-c2KkcVN+NJmuA7CGlaGD1qJh1cLfDnQsHjE89E60vUEMlqduHGCdCLJCID5geFVM0dOtA3ZiIO8BoEQmzQVfpQ==", - "cpu": [ - "ia32" - ], + "node_modules/@babel/runtime": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.29.7.tgz", + "integrity": "sha512-Nq8OhGWiZIZGV6hLHoyAKLLcJihP/xFeBMGJoUrxTX2psI8dCifzLhZISFb+VWS3wFMRDmCGw5R+dOySCqPLhw==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "win32" - ], "engines": { - "node": ">= 10.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/parcel" + "node": ">=6.9.0" } }, - "node_modules/@parcel/watcher-win32-x64": { - "version": "2.5.1", - "resolved": "https://registry.npmjs.org/@parcel/watcher-win32-x64/-/watcher-win32-x64-2.5.1.tgz", - "integrity": "sha512-9lHBdJITeNR++EvSQVUcaZoWupyHfXe1jZvGZ06O/5MflPcuPLtEphScIBL+AiCWBO46tDSHzWyD0uDmmZqsgA==", - "cpu": [ - "x64" - ], + "node_modules/@babel/template": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/template/-/template-7.29.7.tgz", + "integrity": "sha512-puq+Gf35oI24FeN11LkoUQFqv9uwNeWpxXZi/Ji3rRIoKAzKnxRaZ+Gkj0vKS9ZCiTESfng1N9LyOyXvo+m+Gg==", "dev": true, "license": "MIT", - "optional": true, - "os": [ - "win32" - ], - "engines": { - "node": ">= 10.0.0" + "dependencies": { + "@babel/code-frame": "^7.29.7", + "@babel/parser": "^7.29.7", + "@babel/types": "^7.29.7" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/parcel" + "engines": { + "node": ">=6.9.0" } }, - "node_modules/@parcel/watcher/node_modules/detect-libc": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-1.0.3.tgz", - "integrity": "sha512-pGjwhsmsp4kL2RTz08wcOlGN83otlqHeD/Z5T8GXZB+/YcpQ/dgo+lbU8ZsGxV0HIvqqxo9l7mqYwyYMD9bKDg==", + "node_modules/@babel/traverse": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/traverse/-/traverse-7.29.7.tgz", + "integrity": "sha512-EhlfNQtZ+NK22w5BM61ciuiq1m58ed33Wr1Xan//ZRTy6hgjnwyCffRYwzsGXdASJSUJ1guZILsErh1eQcl+zw==", "dev": true, - "license": "Apache-2.0", - "optional": true, - "bin": { - "detect-libc": "bin/detect-libc.js" + "license": "MIT", + "dependencies": { + "@babel/code-frame": "^7.29.7", + "@babel/generator": "^7.29.7", + "@babel/helper-globals": "^7.29.7", + "@babel/parser": "^7.29.7", + "@babel/template": "^7.29.7", + "@babel/types": "^7.29.7", + "debug": "^4.3.1" }, "engines": { - "node": ">=0.10" + "node": ">=6.9.0" } }, - "node_modules/@playwright/test": { - "version": "1.56.1", - "resolved": "https://registry.npmjs.org/@playwright/test/-/test-1.56.1.tgz", - "integrity": "sha512-vSMYtL/zOcFpvJCW71Q/OEGQb7KYBPAdKh35WNSkaZA75JlAO8ED8UN6GUNTm3drWomcbcqRPFqQbLae8yBTdg==", + "node_modules/@babel/types": { + "version": "7.29.7", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.29.7.tgz", + "integrity": "sha512-4zBIxpPzowiZpusoFkyGVwakdRJUyuH5PxQ/PrqghfdFWWasvnCdPfQXHrenDai+gyLARulZjZowCOj6fjT4pA==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", "dependencies": { - "playwright": "1.56.1" - }, - "bin": { - "playwright": "cli.js" + "@babel/helper-string-parser": "^7.29.7", + "@babel/helper-validator-identifier": "^7.29.7" }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@bcoe/v8-coverage": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@bcoe/v8-coverage/-/v8-coverage-1.0.2.tgz", + "integrity": "sha512-6zABk/ECA/QYSCQ1NGiVwwbQerUCZ+TQbp64Q3AgmfNvurHH0j8TtXa1qbShXA6qqkpAj4V5W8pP6mLe1mcMqA==", + "dev": true, + "license": "MIT", "engines": { "node": ">=18" } }, - "node_modules/@polka/url": { - "version": "1.0.0-next.29", - "resolved": "https://registry.npmjs.org/@polka/url/-/url-1.0.0-next.29.tgz", - "integrity": "sha512-wwQAWhWSuHaag8c4q/KN/vCoeOJYshAIvMQwD4GpSb3OiZklFfvAgmj0VCBBImRpuF/aFgIRzllXlVX93Jevww==", + "node_modules/@blazediff/core": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/@blazediff/core/-/core-1.9.1.tgz", + "integrity": "sha512-ehg3jIkYKulZh+8om/O25vkvSsXXwC+skXmyA87FFx6A/45eqOkZsBltMw/TVteb0mloiGT8oGRTcjRAz66zaA==", "dev": true, "license": "MIT" }, - "node_modules/@rollup/rollup-android-arm-eabi": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.60.1.tgz", - "integrity": "sha512-d6FinEBLdIiK+1uACUttJKfgZREXrF0Qc2SmLII7W2AD8FfiZ9Wjd+rD/iRuf5s5dWrr1GgwXCvPqOuDquOowA==", - "cpu": [ - "arm" - ], + "node_modules/@braintree/sanitize-url": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/@braintree/sanitize-url/-/sanitize-url-7.1.2.tgz", + "integrity": "sha512-jigsZK+sMF/cuiB7sERuo9V7N9jx+dhmHHnQyDSVdpZwVutaBu7WvNYqMDLSgFgfB30n452TP3vjDAvFC973mA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@canvas/image-data": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@canvas/image-data/-/image-data-1.1.0.tgz", + "integrity": "sha512-QdObRRjRbcXGmM1tmJ+MrHcaz1MftF2+W7YI+MsphnsCrmtyfS0d5qJbk0MeSbUeyM/jCb0hmnkXPsy026L7dA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@chevrotain/types": { + "version": "11.1.2", + "resolved": "https://registry.npmjs.org/@chevrotain/types/-/types-11.1.2.tgz", + "integrity": "sha512-U+HFai5+zmJCkK86QsaJtoITlboZHBqrVketcO2ROv865xfCMSFpELQoz1GkX5GzME8pTa+3kbKrZHQtI0gdbw==", + "dev": true, + "license": "Apache-2.0" + }, + "node_modules/@chromatic-com/storybook": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/@chromatic-com/storybook/-/storybook-5.0.0.tgz", + "integrity": "sha512-8wUsqL8kg6R5ue8XNE7Jv/iD1SuE4+6EXMIGIuE+T2loBITEACLfC3V8W44NJviCLusZRMWbzICddz0nU0bFaw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@neoconfetti/react": "^1.0.0", + "chromatic": "^13.3.4", + "filesize": "^10.0.12", + "jsonfile": "^6.1.0", + "strip-ansi": "^7.1.0" + }, + "engines": { + "node": ">=20.0.0", + "yarn": ">=1.22.18" + }, + "peerDependencies": { + "storybook": "^0.0.0-0 || ^10.1.0 || ^10.1.0-0 || ^10.2.0-0 || ^10.3.0-0" + } + }, + "node_modules/@emnapi/core": { + "version": "1.9.2", + "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.9.2.tgz", + "integrity": "sha512-UC+ZhH3XtczQYfOlu3lNEkdW/p4dsJ1r/bP7H8+rhao3TTTMO1ATq/4DdIi23XuGoFY+Cz0JmCbdVl0hz9jZcA==", "dev": true, "license": "MIT", "optional": true, - "os": [ - "android" - ] + "dependencies": { + "@emnapi/wasi-threads": "1.2.1", + "tslib": "^2.4.0" + } }, - "node_modules/@rollup/rollup-android-arm64": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.60.1.tgz", - "integrity": "sha512-YjG/EwIDvvYI1YvYbHvDz/BYHtkY4ygUIXHnTdLhG+hKIQFBiosfWiACWortsKPKU/+dUwQQCKQM3qrDe8c9BA==", - "cpu": [ - "arm64" - ], + "node_modules/@emnapi/runtime": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.10.0.tgz", + "integrity": "sha512-ewvYlk86xUoGI0zQRNq/mC+16R1QeDlKQy21Ki3oSYXNgLb45GV1P6A0M+/s6nyCuNDqe5VpaY84BzXGwVbwFA==", "dev": true, "license": "MIT", "optional": true, - "os": [ - "android" - ] + "dependencies": { + "tslib": "^2.4.0" + } }, - "node_modules/@rollup/rollup-darwin-arm64": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.60.1.tgz", - "integrity": "sha512-mjCpF7GmkRtSJwon+Rq1N8+pI+8l7w5g9Z3vWj4T7abguC4Czwi3Yu/pFaLvA3TTeMVjnu3ctigusqWUfjZzvw==", - "cpu": [ - "arm64" - ], + "node_modules/@emnapi/wasi-threads": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@emnapi/wasi-threads/-/wasi-threads-1.2.1.tgz", + "integrity": "sha512-uTII7OYF+/Mes/MrcIOYp5yOtSMLBWSIoLPpcgwipoiKbli6k322tcoFsxoIIxPDqW01SQGAgko4EzZi2BNv2w==", "dev": true, "license": "MIT", "optional": true, - "os": [ - "darwin" - ] + "dependencies": { + "tslib": "^2.4.0" + } }, - "node_modules/@rollup/rollup-darwin-x64": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.60.1.tgz", - "integrity": "sha512-haZ7hJ1JT4e9hqkoT9R/19XW2QKqjfJVv+i5AGg57S+nLk9lQnJ1F/eZloRO3o9Scy9CM3wQ9l+dkXtcBgN5Ew==", + "node_modules/@esbuild/aix-ppc64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.27.7.tgz", + "integrity": "sha512-EKX3Qwmhz1eMdEJokhALr0YiD0lhQNwDqkPYyPhiSwKrh7/4KRjQc04sZ8db+5DVVnZ1LmbNDI1uAMPEUBnQPg==", "cpu": [ - "x64" + "ppc64" ], "dev": true, "license": "MIT", "optional": true, "os": [ - "darwin" - ] + "aix" + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-freebsd-arm64": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.60.1.tgz", - "integrity": "sha512-czw90wpQq3ZsAVBlinZjAYTKduOjTywlG7fEeWKUA7oCmpA8xdTkxZZlwNJKWqILlq0wehoZcJYfBvOyhPTQ6w==", + "node_modules/@esbuild/android-arm": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.27.7.tgz", + "integrity": "sha512-jbPXvB4Yj2yBV7HUfE2KHe4GJX51QplCN1pGbYjvsyCZbQmies29EoJbkEc+vYuU5o45AfQn37vZlyXy4YJ8RQ==", "cpu": [ - "arm64" + "arm" ], "dev": true, "license": "MIT", "optional": true, "os": [ - "freebsd" - ] + "android" + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-freebsd-x64": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.60.1.tgz", - "integrity": "sha512-KVB2rqsxTHuBtfOeySEyzEOB7ltlB/ux38iu2rBQzkjbwRVlkhAGIEDiiYnO2kFOkJp+Z7pUXKyrRRFuFUKt+g==", + "node_modules/@esbuild/android-arm64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.27.7.tgz", + "integrity": "sha512-62dPZHpIXzvChfvfLJow3q5dDtiNMkwiRzPylSCfriLvZeq0a1bWChrGx/BbUbPwOrsWKMn8idSllklzBy+dgQ==", "cpu": [ - "x64" + "arm64" ], "dev": true, "license": "MIT", "optional": true, "os": [ - "freebsd" - ] + "android" + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-linux-arm-gnueabihf": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.60.1.tgz", - "integrity": "sha512-L+34Qqil+v5uC0zEubW7uByo78WOCIrBvci69E7sFASRl0X7b/MB6Cqd1lky/CtcSVTydWa2WZwFuWexjS5o6g==", + "node_modules/@esbuild/android-x64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.27.7.tgz", + "integrity": "sha512-x5VpMODneVDb70PYV2VQOmIUUiBtY3D3mPBG8NxVk5CogneYhkR7MmM3yR/uMdITLrC1ml/NV1rj4bMJuy9MCg==", "cpu": [ - "arm" + "x64" ], "dev": true, "license": "MIT", "optional": true, "os": [ - "linux" - ] + "android" + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-linux-arm-musleabihf": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.60.1.tgz", - "integrity": "sha512-n83O8rt4v34hgFzlkb1ycniJh7IR5RCIqt6mz1VRJD6pmhRi0CXdmfnLu9dIUS6buzh60IvACM842Ffb3xd6Gg==", + "node_modules/@esbuild/darwin-arm64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.27.7.tgz", + "integrity": "sha512-5lckdqeuBPlKUwvoCXIgI2D9/ABmPq3Rdp7IfL70393YgaASt7tbju3Ac+ePVi3KDH6N2RqePfHnXkaDtY9fkw==", "cpu": [ - "arm" + "arm64" ], "dev": true, "license": "MIT", "optional": true, "os": [ - "linux" - ] + "darwin" + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-linux-arm64-gnu": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.60.1.tgz", - "integrity": "sha512-Nql7sTeAzhTAja3QXeAI48+/+GjBJ+QmAH13snn0AJSNL50JsDqotyudHyMbO2RbJkskbMbFJfIJKWA6R1LCJQ==", + "node_modules/@esbuild/darwin-x64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.27.7.tgz", + "integrity": "sha512-rYnXrKcXuT7Z+WL5K980jVFdvVKhCHhUwid+dDYQpH+qu+TefcomiMAJpIiC2EM3Rjtq0sO3StMV/+3w3MyyqQ==", "cpu": [ - "arm64" + "x64" ], "dev": true, "license": "MIT", "optional": true, "os": [ - "linux" - ] + "darwin" + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-linux-arm64-musl": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.60.1.tgz", - "integrity": "sha512-+pUymDhd0ys9GcKZPPWlFiZ67sTWV5UU6zOJat02M1+PiuSGDziyRuI/pPue3hoUwm2uGfxdL+trT6Z9rxnlMA==", + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.27.7.tgz", + "integrity": "sha512-B48PqeCsEgOtzME2GbNM2roU29AMTuOIN91dsMO30t+Ydis3z/3Ngoj5hhnsOSSwNzS+6JppqWsuhTp6E82l2w==", "cpu": [ "arm64" ], @@ -1860,139 +1943,169 @@ "license": "MIT", "optional": true, "os": [ - "linux" - ] + "freebsd" + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-linux-loong64-gnu": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.60.1.tgz", - "integrity": "sha512-VSvgvQeIcsEvY4bKDHEDWcpW4Yw7BtlKG1GUT4FzBUlEKQK0rWHYBqQt6Fm2taXS+1bXvJT6kICu5ZwqKCnvlQ==", + "node_modules/@esbuild/freebsd-x64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.27.7.tgz", + "integrity": "sha512-jOBDK5XEjA4m5IJK3bpAQF9/Lelu/Z9ZcdhTRLf4cajlB+8VEhFFRjWgfy3M1O4rO2GQ/b2dLwCUGpiF/eATNQ==", "cpu": [ - "loong64" + "x64" ], "dev": true, "license": "MIT", "optional": true, "os": [ - "linux" - ] + "freebsd" + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-linux-loong64-musl": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-musl/-/rollup-linux-loong64-musl-4.60.1.tgz", - "integrity": "sha512-4LqhUomJqwe641gsPp6xLfhqWMbQV04KtPp7/dIp0nzPxAkNY1AbwL5W0MQpcalLYk07vaW9Kp1PBhdpZYYcEw==", + "node_modules/@esbuild/linux-arm": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.27.7.tgz", + "integrity": "sha512-RkT/YXYBTSULo3+af8Ib0ykH8u2MBh57o7q/DAs3lTJlyVQkgQvlrPTnjIzzRPQyavxtPtfg0EopvDyIt0j1rA==", "cpu": [ - "loong64" + "arm" ], "dev": true, "license": "MIT", "optional": true, "os": [ "linux" - ] + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-linux-ppc64-gnu": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.60.1.tgz", - "integrity": "sha512-tLQQ9aPvkBxOc/EUT6j3pyeMD6Hb8QF2BTBnCQWP/uu1lhc9AIrIjKnLYMEroIz/JvtGYgI9dF3AxHZNaEH0rw==", + "node_modules/@esbuild/linux-arm64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.27.7.tgz", + "integrity": "sha512-RZPHBoxXuNnPQO9rvjh5jdkRmVizktkT7TCDkDmQ0W2SwHInKCAV95GRuvdSvA7w4VMwfCjUiPwDi0ZO6Nfe9A==", "cpu": [ - "ppc64" + "arm64" ], "dev": true, "license": "MIT", "optional": true, "os": [ "linux" - ] + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-linux-ppc64-musl": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-musl/-/rollup-linux-ppc64-musl-4.60.1.tgz", - "integrity": "sha512-RMxFhJwc9fSXP6PqmAz4cbv3kAyvD1etJFjTx4ONqFP9DkTkXsAMU4v3Vyc5BgzC+anz7nS/9tp4obsKfqkDHg==", + "node_modules/@esbuild/linux-ia32": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.27.7.tgz", + "integrity": "sha512-GA48aKNkyQDbd3KtkplYWT102C5sn/EZTY4XROkxONgruHPU72l+gW+FfF8tf2cFjeHaRbWpOYa/uRBz/Xq1Pg==", "cpu": [ - "ppc64" + "ia32" ], "dev": true, "license": "MIT", "optional": true, "os": [ "linux" - ] + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-linux-riscv64-gnu": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.60.1.tgz", - "integrity": "sha512-QKgFl+Yc1eEk6MmOBfRHYF6lTxiiiV3/z/BRrbSiW2I7AFTXoBFvdMEyglohPj//2mZS4hDOqeB0H1ACh3sBbg==", + "node_modules/@esbuild/linux-loong64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.27.7.tgz", + "integrity": "sha512-a4POruNM2oWsD4WKvBSEKGIiWQF8fZOAsycHOt6JBpZ+JN2n2JH9WAv56SOyu9X5IqAjqSIPTaJkqN8F7XOQ5Q==", "cpu": [ - "riscv64" + "loong64" ], "dev": true, "license": "MIT", "optional": true, "os": [ "linux" - ] + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-linux-riscv64-musl": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.60.1.tgz", - "integrity": "sha512-RAjXjP/8c6ZtzatZcA1RaQr6O1TRhzC+adn8YZDnChliZHviqIjmvFwHcxi4JKPSDAt6Uhf/7vqcBzQJy0PDJg==", + "node_modules/@esbuild/linux-mips64el": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.27.7.tgz", + "integrity": "sha512-KabT5I6StirGfIz0FMgl1I+R1H73Gp0ofL9A3nG3i/cYFJzKHhouBV5VWK1CSgKvVaG4q1RNpCTR2LuTVB3fIw==", "cpu": [ - "riscv64" + "mips64el" ], "dev": true, "license": "MIT", "optional": true, "os": [ "linux" - ] + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-linux-s390x-gnu": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.60.1.tgz", - "integrity": "sha512-wcuocpaOlaL1COBYiA89O6yfjlp3RwKDeTIA0hM7OpmhR1Bjo9j31G1uQVpDlTvwxGn2nQs65fBFL5UFd76FcQ==", + "node_modules/@esbuild/linux-ppc64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.27.7.tgz", + "integrity": "sha512-gRsL4x6wsGHGRqhtI+ifpN/vpOFTQtnbsupUF5R5YTAg+y/lKelYR1hXbnBdzDjGbMYjVJLJTd2OFmMewAgwlQ==", "cpu": [ - "s390x" + "ppc64" ], "dev": true, "license": "MIT", "optional": true, "os": [ "linux" - ] + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-linux-x64-gnu": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.60.1.tgz", - "integrity": "sha512-77PpsFQUCOiZR9+LQEFg9GClyfkNXj1MP6wRnzYs0EeWbPcHs02AXu4xuUbM1zhwn3wqaizle3AEYg5aeoohhg==", + "node_modules/@esbuild/linux-riscv64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.27.7.tgz", + "integrity": "sha512-hL25LbxO1QOngGzu2U5xeXtxXcW+/GvMN3ejANqXkxZ/opySAZMrc+9LY/WyjAan41unrR3YrmtTsUpwT66InQ==", "cpu": [ - "x64" + "riscv64" ], "dev": true, "license": "MIT", "optional": true, "os": [ "linux" - ] + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-linux-x64-musl": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.60.1.tgz", - "integrity": "sha512-5cIATbk5vynAjqqmyBjlciMJl1+R/CwX9oLk/EyiFXDWd95KpHdrOJT//rnUl4cUcskrd0jCCw3wpZnhIHdD9w==", + "node_modules/@esbuild/linux-s390x": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.27.7.tgz", + "integrity": "sha512-2k8go8Ycu1Kb46vEelhu1vqEP+UeRVj2zY1pSuPdgvbd5ykAw82Lrro28vXUrRmzEsUV0NzCf54yARIK8r0fdw==", "cpu": [ - "x64" + "s390x" ], "dev": true, "license": "MIT", "optional": true, "os": [ "linux" - ] + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-openbsd-x64": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-openbsd-x64/-/rollup-openbsd-x64-4.60.1.tgz", - "integrity": "sha512-cl0w09WsCi17mcmWqqglez9Gk8isgeWvoUZ3WiJFYSR3zjBQc2J5/ihSjpl+VLjPqjQ/1hJRcqBfLjssREQILw==", + "node_modules/@esbuild/linux-x64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.27.7.tgz", + "integrity": "sha512-hzznmADPt+OmsYzw1EE33ccA+HPdIqiCRq7cQeL1Jlq2gb1+OyWBkMCrYGBJ+sxVzve2ZJEVeePbLM2iEIZSxA==", "cpu": [ "x64" ], @@ -2000,13 +2113,16 @@ "license": "MIT", "optional": true, "os": [ - "openbsd" - ] + "linux" + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-openharmony-arm64": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.60.1.tgz", - "integrity": "sha512-4Cv23ZrONRbNtbZa37mLSueXUCtN7MXccChtKpUnQNgF010rjrjfHx3QxkS2PI7LqGT5xXyYs1a7LbzAwT0iCA==", + "node_modules/@esbuild/netbsd-arm64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-arm64/-/netbsd-arm64-0.27.7.tgz", + "integrity": "sha512-b6pqtrQdigZBwZxAn1UpazEisvwaIDvdbMbmrly7cDTMFnw/+3lVxxCTGOrkPVnsYIosJJXAsILG9XcQS+Yu6w==", "cpu": [ "arm64" ], @@ -2014,41 +2130,50 @@ "license": "MIT", "optional": true, "os": [ - "openharmony" - ] + "netbsd" + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-win32-arm64-msvc": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.60.1.tgz", - "integrity": "sha512-i1okWYkA4FJICtr7KpYzFpRTHgy5jdDbZiWfvny21iIKky5YExiDXP+zbXzm3dUcFpkEeYNHgQ5fuG236JPq0g==", + "node_modules/@esbuild/netbsd-x64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.27.7.tgz", + "integrity": "sha512-OfatkLojr6U+WN5EDYuoQhtM+1xco+/6FSzJJnuWiUw5eVcicbyK3dq5EeV/QHT1uy6GoDhGbFpprUiHUYggrw==", "cpu": [ - "arm64" + "x64" ], "dev": true, "license": "MIT", "optional": true, "os": [ - "win32" - ] + "netbsd" + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-win32-ia32-msvc": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.60.1.tgz", - "integrity": "sha512-u09m3CuwLzShA0EYKMNiFgcjjzwqtUMLmuCJLeZWjjOYA3IT2Di09KaxGBTP9xVztWyIWjVdsB2E9goMjZvTQg==", + "node_modules/@esbuild/openbsd-arm64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-arm64/-/openbsd-arm64-0.27.7.tgz", + "integrity": "sha512-AFuojMQTxAz75Fo8idVcqoQWEHIXFRbOc1TrVcFSgCZtQfSdc1RXgB3tjOn/krRHENUB4j00bfGjyl2mJrU37A==", "cpu": [ - "ia32" + "arm64" ], "dev": true, "license": "MIT", "optional": true, "os": [ - "win32" - ] + "openbsd" + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-win32-x64-gnu": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.60.1.tgz", - "integrity": "sha512-k+600V9Zl1CM7eZxJgMyTUzmrmhB/0XZnF4pRypKAlAgxmedUA+1v9R+XOFv56W4SlHEzfeMtzujLJD22Uz5zg==", + "node_modules/@esbuild/openbsd-x64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.27.7.tgz", + "integrity": "sha512-+A1NJmfM8WNDv5CLVQYJ5PshuRm/4cI6WMZRg1by1GwPIQPCTs1GLEUHwiiQGT5zDdyLiRM/l1G0Pv54gvtKIg==", "cpu": [ "x64" ], @@ -2056,958 +2181,3652 @@ "license": "MIT", "optional": true, "os": [ - "win32" - ] + "openbsd" + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@rollup/rollup-win32-x64-msvc": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.60.1.tgz", - "integrity": "sha512-lWMnixq/QzxyhTV6NjQJ4SFo1J6PvOX8vUx5Wb4bBPsEb+8xZ89Bz6kOXpfXj9ak9AHTQVQzlgzBEc1SyM27xQ==", + "node_modules/@esbuild/openharmony-arm64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/openharmony-arm64/-/openharmony-arm64-0.27.7.tgz", + "integrity": "sha512-+KrvYb/C8zA9CU/g0sR6w2RBw7IGc5J2BPnc3dYc5VJxHCSF1yNMxTV5LQ7GuKteQXZtspjFbiuW5/dOj7H4Yw==", "cpu": [ - "x64" + "arm64" ], "dev": true, "license": "MIT", "optional": true, "os": [ - "win32" - ] - }, - "node_modules/@standard-schema/spec": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.1.0.tgz", - "integrity": "sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==", - "dev": true, - "license": "MIT" + "openharmony" + ], + "engines": { + "node": ">=18" + } }, - "node_modules/@storybook/addon-a11y": { - "version": "10.2.4", - "resolved": "https://registry.npmjs.org/@storybook/addon-a11y/-/addon-a11y-10.2.4.tgz", - "integrity": "sha512-VGhdZ+iP2l/CSulIKV2kt3SMWVHntOigqWqGkNYf6YNYofynUYEKdsNqBvHx4ySuNEl/eXJ8LRO8FKYnU7LxZQ==", + "node_modules/@esbuild/sunos-x64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.27.7.tgz", + "integrity": "sha512-ikktIhFBzQNt/QDyOL580ti9+5mL/YZeUPKU2ivGtGjdTYoqz6jObj6nOMfhASpS4GU4Q/Clh1QtxWAvcYKamA==", + "cpu": [ + "x64" + ], "dev": true, "license": "MIT", - "dependencies": { - "@storybook/global": "^5.0.0", - "axe-core": "^4.2.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/storybook" - }, - "peerDependencies": { - "storybook": "^10.2.4" + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=18" } }, - "node_modules/@storybook/addon-docs": { - "version": "10.2.4", - "resolved": "https://registry.npmjs.org/@storybook/addon-docs/-/addon-docs-10.2.4.tgz", - "integrity": "sha512-FzscAmdBiOGnGrxiEM+8eTg43kjqgjLfObg+lbJVRR/a0DmZ3xfAPNB0+VKYQbN0FacNcWLM9LZ/7U0hRBPBnQ==", + "node_modules/@esbuild/win32-arm64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.27.7.tgz", + "integrity": "sha512-7yRhbHvPqSpRUV7Q20VuDwbjW5kIMwTHpptuUzV+AA46kiPze5Z7qgt6CLCK3pWFrHeNfDd1VKgyP4O+ng17CA==", + "cpu": [ + "arm64" + ], "dev": true, "license": "MIT", - "dependencies": { - "@mdx-js/react": "^3.0.0", - "@storybook/csf-plugin": "10.2.4", - "@storybook/icons": "^2.0.1", - "@storybook/react-dom-shim": "10.2.4", - "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", - "react-dom": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", - "ts-dedent": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/storybook" - }, - "peerDependencies": { - "storybook": "^10.2.4" + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" } }, - "node_modules/@storybook/addon-svelte-csf": { - "version": "5.0.10", - "resolved": "https://registry.npmjs.org/@storybook/addon-svelte-csf/-/addon-svelte-csf-5.0.10.tgz", - "integrity": "sha512-poSvTS7VdaQ42ZoqW5e4+2Hv1iLO0mekH9fwn/QuBNse48R4WlTyR8XFbHRTfatl9gdc9ZYC4uWzazrmV6zGIA==", + "node_modules/@esbuild/win32-ia32": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.27.7.tgz", + "integrity": "sha512-SmwKXe6VHIyZYbBLJrhOoCJRB/Z1tckzmgTLfFYOfpMAx63BJEaL9ExI8x7v0oAO3Zh6D/Oi1gVxEYr5oUCFhw==", + "cpu": [ + "ia32" + ], "dev": true, "license": "MIT", - "dependencies": { - "@storybook/csf": "^0.1.13", - "dedent": "^1.5.3", - "es-toolkit": "^1.26.1", - "esrap": "^1.2.2", - "magic-string": "^0.30.12", - "svelte-ast-print": "^0.4.0", - "zimmerframe": "^1.1.2" - }, - "peerDependencies": { - "@storybook/svelte": "^0.0.0-0 || ^8.2.0 || ^9.0.0 || ^9.1.0-0 || ^10.0.0-0", - "@sveltejs/vite-plugin-svelte": "^4.0.0 || ^5.0.0 || ^6.0.0", - "storybook": "^0.0.0-0 || ^8.2.0 || ^9.0.0 || ^9.1.0-0 || ^10.0.0-0", - "svelte": "^5.0.0", - "vite": "^5.0.0 || ^6.0.0 || ^7.0.0" + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" } }, - "node_modules/@storybook/addon-vitest": { - "version": "10.2.4", - "resolved": "https://registry.npmjs.org/@storybook/addon-vitest/-/addon-vitest-10.2.4.tgz", - "integrity": "sha512-BT1iP89U4wcbpzTURU8WYTAeUcdNh4WIt0BqsnATmMwR/jKNJW6QgXCVqGQTSpRjWj40hX5e2JkQYCNXdjKsPw==", + "node_modules/@esbuild/win32-x64": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.27.7.tgz", + "integrity": "sha512-56hiAJPhwQ1R4i+21FVF7V8kSD5zZTdHcVuRFMW0hn753vVfQN8xlx4uOPT4xoGH0Z/oVATuR82AiqSTDIpaHg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@eslint-community/eslint-utils": { + "version": "4.9.1", + "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.9.1.tgz", + "integrity": "sha512-phrYmNiYppR7znFEdqgfWHXR6NCkZEK7hwWDHZUjit/2/U0r6XvkDl0SYnoM51Hq7FhCGdLDT6zxCCOY1hexsQ==", "dev": true, "license": "MIT", "dependencies": { - "@storybook/global": "^5.0.0", - "@storybook/icons": "^2.0.1" + "eslint-visitor-keys": "^3.4.3" + }, + "engines": { + "node": "^12.22.0 || ^14.17.0 || >=16.0.0" }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/storybook" + "url": "https://opencollective.com/eslint" }, "peerDependencies": { - "@vitest/browser": "^3.0.0 || ^4.0.0", - "@vitest/browser-playwright": "^4.0.0", - "@vitest/runner": "^3.0.0 || ^4.0.0", - "storybook": "^10.2.4", - "vitest": "^3.0.0 || ^4.0.0" - }, - "peerDependenciesMeta": { - "@vitest/browser": { - "optional": true - }, - "@vitest/browser-playwright": { - "optional": true - }, - "@vitest/runner": { - "optional": true - }, - "vitest": { - "optional": true - } + "eslint": "^6.0.0 || ^7.0.0 || >=8.0.0" } }, - "node_modules/@storybook/builder-vite": { - "version": "10.2.4", - "resolved": "https://registry.npmjs.org/@storybook/builder-vite/-/builder-vite-10.2.4.tgz", - "integrity": "sha512-/hcT1xj3CL5GkJ5v5/EguZdttDwNE6weNXK7vKzp034tnGcLycOossDsTiUQkBowSL+Ylc8aKj+ZgvddPNfOig==", + "node_modules/@eslint-community/eslint-utils/node_modules/eslint-visitor-keys": { + "version": "3.4.3", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-3.4.3.tgz", + "integrity": "sha512-wpc+LXeiyiisxPlEkUzU6svyS1frIO3Mgxj1fdy7Pm8Ygzguax2N3Fa/D/ag1WqbOprdI+uY6wMUl8/a2G+iag==", "dev": true, - "license": "MIT", - "dependencies": { - "@storybook/csf-plugin": "10.2.4", - "ts-dedent": "^2.0.0" + "license": "Apache-2.0", + "engines": { + "node": "^12.22.0 || ^14.17.0 || >=16.0.0" }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/storybook" - }, - "peerDependencies": { - "storybook": "^10.2.4", - "vite": "^5.0.0 || ^6.0.0 || ^7.0.0" + "url": "https://opencollective.com/eslint" } }, - "node_modules/@storybook/csf": { - "version": "0.1.13", - "resolved": "https://registry.npmjs.org/@storybook/csf/-/csf-0.1.13.tgz", - "integrity": "sha512-7xOOwCLGB3ebM87eemep89MYRFTko+D8qE7EdAAq74lgdqRR5cOUtYWJLjO2dLtP94nqoOdHJo6MdLLKzg412Q==", + "node_modules/@eslint-community/regexpp": { + "version": "4.12.2", + "resolved": "https://registry.npmjs.org/@eslint-community/regexpp/-/regexpp-4.12.2.tgz", + "integrity": "sha512-EriSTlt5OC9/7SXkRSCAhfSxxoSUgBm33OH+IkwbdpgoqsSsUg7y3uh+IICI/Qg4BBWr3U2i39RpmycbxMq4ew==", "dev": true, "license": "MIT", - "dependencies": { - "type-fest": "^2.19.0" + "engines": { + "node": "^12.0.0 || ^14.0.0 || >=16.0.0" } }, - "node_modules/@storybook/csf-plugin": { - "version": "10.2.4", - "resolved": "https://registry.npmjs.org/@storybook/csf-plugin/-/csf-plugin-10.2.4.tgz", - "integrity": "sha512-kupPQEV+4N9mzsZHYaokvhO/KHBjYdWda9PNmPQwy0TR7r2mzthgaNH72TjmgN1L6DIbsuyOG1wtczcPJn4+Jg==", + "node_modules/@eslint/compat": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/@eslint/compat/-/compat-1.4.1.tgz", + "integrity": "sha512-cfO82V9zxxGBxcQDr1lfaYB7wykTa0b00mGa36FrJl7iTFd0Z2cHfEYuxcBRP/iNijCsWsEkA+jzT8hGYmv33w==", "dev": true, - "license": "MIT", + "license": "Apache-2.0", "dependencies": { - "unplugin": "^2.3.5" + "@eslint/core": "^0.17.0" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/storybook" + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" }, "peerDependencies": { - "esbuild": "*", - "rollup": "*", - "storybook": "^10.2.4", - "vite": "*", - "webpack": "*" + "eslint": "^8.40 || 9" }, "peerDependenciesMeta": { - "esbuild": { - "optional": true - }, - "rollup": { - "optional": true - }, - "vite": { - "optional": true - }, - "webpack": { + "eslint": { "optional": true } } }, - "node_modules/@storybook/global": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/@storybook/global/-/global-5.0.0.tgz", - "integrity": "sha512-FcOqPAXACP0I3oJ/ws6/rrPT9WGhu915Cg8D02a9YxLo0DE9zI+a9A5gRGvmQ09fiWPukqI8ZAEoQEdWUKMQdQ==", - "dev": true, - "license": "MIT" - }, - "node_modules/@storybook/icons": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/@storybook/icons/-/icons-2.0.1.tgz", - "integrity": "sha512-/smVjw88yK3CKsiuR71vNgWQ9+NuY2L+e8X7IMrFjexjm6ZR8ULrV2DRkTA61aV6ryefslzHEGDInGpnNeIocg==", + "node_modules/@eslint/config-array": { + "version": "0.21.2", + "resolved": "https://registry.npmjs.org/@eslint/config-array/-/config-array-0.21.2.tgz", + "integrity": "sha512-nJl2KGTlrf9GjLimgIru+V/mzgSK0ABCDQRvxw5BjURL7WfH5uoWmizbH7QB6MmnMBd8cIC9uceWnezL1VZWWw==", "dev": true, - "license": "MIT", - "peerDependencies": { - "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", - "react-dom": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" + "license": "Apache-2.0", + "dependencies": { + "@eslint/object-schema": "^2.1.7", + "debug": "^4.3.1", + "minimatch": "^3.1.5" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" } }, - "node_modules/@storybook/react-dom-shim": { - "version": "10.2.4", - "resolved": "https://registry.npmjs.org/@storybook/react-dom-shim/-/react-dom-shim-10.2.4.tgz", - "integrity": "sha512-i22OtrZ7GeZPt/odLf0vqyDhRSKyaLsHkkKSBcANQfzRRnBZmiz2FchOtWm9uvoDWybQsTruZq7kTdtpEhwyGw==", + "node_modules/@eslint/config-helpers": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/@eslint/config-helpers/-/config-helpers-0.4.2.tgz", + "integrity": "sha512-gBrxN88gOIf3R7ja5K9slwNayVcZgK6SOUORm2uBzTeIEfeVaIhOpCtTox3P6R7o2jLFwLFTLnC7kU/RGcYEgw==", "dev": true, - "license": "MIT", - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/storybook" + "license": "Apache-2.0", + "dependencies": { + "@eslint/core": "^0.17.0" }, - "peerDependencies": { - "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", - "react-dom": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", - "storybook": "^10.2.4" + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" } }, - "node_modules/@storybook/svelte": { - "version": "10.2.4", - "resolved": "https://registry.npmjs.org/@storybook/svelte/-/svelte-10.2.4.tgz", - "integrity": "sha512-W9R51zUCd2iHOQBg/D93+bdpYv6kbtFx+kft5X8lPKQl6yEu0aKs9i5N5GyCASOhIApgx/tkqZIJ7vgM4cqrHA==", + "node_modules/@eslint/core": { + "version": "0.17.0", + "resolved": "https://registry.npmjs.org/@eslint/core/-/core-0.17.0.tgz", + "integrity": "sha512-yL/sLrpmtDaFEiUj1osRP4TI2MDz1AddJL+jZ7KSqvBuliN4xqYY54IfdN8qD8Toa6g1iloph1fxQNkjOxrrpQ==", "dev": true, - "license": "MIT", + "license": "Apache-2.0", "dependencies": { - "ts-dedent": "^2.0.0", - "type-fest": "~2.19" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/storybook" + "@types/json-schema": "^7.0.15" }, - "peerDependencies": { - "storybook": "^10.2.4", - "svelte": "^5.0.0" + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" } }, - "node_modules/@storybook/svelte-vite": { - "version": "10.2.4", - "resolved": "https://registry.npmjs.org/@storybook/svelte-vite/-/svelte-vite-10.2.4.tgz", - "integrity": "sha512-FMgKMRdoZFDwPD6eIDMldcgp6d6NtIGuXyUJjb29qLias/gE5TI6hg+cWmmWXQRTrXwdyepeMBmIfRcZbB6REQ==", + "node_modules/@eslint/eslintrc": { + "version": "3.3.5", + "resolved": "https://registry.npmjs.org/@eslint/eslintrc/-/eslintrc-3.3.5.tgz", + "integrity": "sha512-4IlJx0X0qftVsN5E+/vGujTRIFtwuLbNsVUe7TO6zYPDR1O6nFwvwhIKEKSrl6dZchmYBITazxKoUYOjdtjlRg==", "dev": true, "license": "MIT", "dependencies": { - "@storybook/builder-vite": "10.2.4", - "@storybook/svelte": "10.2.4", - "magic-string": "^0.30.0", - "svelte2tsx": "^0.7.44", - "typescript": "^4.9.4 || ^5.0.0" + "ajv": "^6.14.0", + "debug": "^4.3.2", + "espree": "^10.0.1", + "globals": "^14.0.0", + "ignore": "^5.2.0", + "import-fresh": "^3.2.1", + "js-yaml": "^4.1.1", + "minimatch": "^3.1.5", + "strip-json-comments": "^3.1.1" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/storybook" + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" }, - "peerDependencies": { - "@sveltejs/vite-plugin-svelte": "^2.0.0 || ^3.0.0 || ^4.0.0 || ^5.0.0 || ^6.0.0", - "storybook": "^10.2.4", - "svelte": "^5.0.0", - "vite": "^5.0.0 || ^6.0.0 || ^7.0.0" + "funding": { + "url": "https://opencollective.com/eslint" } }, - "node_modules/@storybook/sveltekit": { - "version": "10.2.4", - "resolved": "https://registry.npmjs.org/@storybook/sveltekit/-/sveltekit-10.2.4.tgz", - "integrity": "sha512-1qDX35iSJHWo1AOd7HMzJtCHBfgahXqTWNiyZa/JMEKJ3qC1otaU8XMmTjsZ6fCRF99piNdgqtWM8+s1TJOldg==", + "node_modules/@eslint/eslintrc/node_modules/ajv": { + "version": "6.15.0", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.15.0.tgz", + "integrity": "sha512-fgFx7Hfoq60ytK2c7DhnF8jIvzYgOMxfugjLOSMHjLIPgenqa7S7oaagATUq99mV6IYvN2tRmC0wnTYX6iPbMw==", "dev": true, "license": "MIT", "dependencies": { - "@storybook/builder-vite": "10.2.4", - "@storybook/svelte": "10.2.4", - "@storybook/svelte-vite": "10.2.4" + "fast-deep-equal": "^3.1.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/storybook" - }, - "peerDependencies": { - "storybook": "^10.2.4", - "svelte": "^5.0.0", - "vite": "^5.0.0 || ^6.0.0 || ^7.0.0" + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" } }, - "node_modules/@sveltejs/acorn-typescript": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/@sveltejs/acorn-typescript/-/acorn-typescript-1.0.5.tgz", - "integrity": "sha512-IwQk4yfwLdibDlrXVE04jTZYlLnwsTT2PIOQQGNLWfjavGifnk1JD1LcZjZaBTRcxZu2FfPfNLOE04DSu9lqtQ==", + "node_modules/@eslint/eslintrc/node_modules/globals": { + "version": "14.0.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-14.0.0.tgz", + "integrity": "sha512-oahGvuMGQlPw/ivIYBjVSrWAfWLBeku5tpPE2fOPLi+WHffIWbuh2tCjhyQhTBPMf5E9jDEH4FOmTYgYwbKwtQ==", "dev": true, "license": "MIT", - "peerDependencies": { - "acorn": "^8.9.0" + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/@sveltejs/adapter-static": { - "version": "3.0.10", - "resolved": "https://registry.npmjs.org/@sveltejs/adapter-static/-/adapter-static-3.0.10.tgz", - "integrity": "sha512-7D9lYFWJmB7zxZyTE/qxjksvMqzMuYrrsyh1f4AlZqeZeACPRySjbC3aFiY55wb1tWUaKOQG9PVbm74JcN2Iew==", + "node_modules/@eslint/eslintrc/node_modules/json-schema-traverse": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", "dev": true, - "license": "MIT", - "peerDependencies": { - "@sveltejs/kit": "^2.0.0" - } + "license": "MIT" }, - "node_modules/@sveltejs/kit": { - "version": "2.60.1", - "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.60.1.tgz", - "integrity": "sha512-mQjlkNo+rJvpln7V2IGY2j99BqhcFbS4UN0AQNKNYfhBAFZTuCDAdW3a1sgf330mvtNvsBXn3HpAhcmvdJTcIQ==", + "node_modules/@eslint/js": { + "version": "9.39.2", + "resolved": "https://registry.npmjs.org/@eslint/js/-/js-9.39.2.tgz", + "integrity": "sha512-q1mjIoW1VX4IvSocvM/vbTiveKC4k9eLrajNEuSsmjymSDEbpGddtpfOoN7YGAqBK3NG+uqo8ia4PDTt8buCYA==", "dev": true, "license": "MIT", - "dependencies": { - "@standard-schema/spec": "^1.0.0", - "@sveltejs/acorn-typescript": "^1.0.5", - "@types/cookie": "^0.6.0", - "acorn": "^8.14.1", - "cookie": "^0.6.0", - "devalue": "^5.8.1", - "esm-env": "^1.2.2", - "kleur": "^4.1.5", - "magic-string": "^0.30.5", - "mrmime": "^2.0.0", - "set-cookie-parser": "^3.0.0", - "sirv": "^3.0.0" - }, - "bin": { - "svelte-kit": "svelte-kit.js" - }, "engines": { - "node": ">=18.13" - }, - "peerDependencies": { - "@opentelemetry/api": "^1.0.0", - "@sveltejs/vite-plugin-svelte": "^3.0.0 || ^4.0.0-next.1 || ^5.0.0 || ^6.0.0-next.0 || ^7.0.0", - "svelte": "^4.0.0 || ^5.0.0-next.0", - "typescript": "^5.3.3 || ^6.0.0", - "vite": "^5.0.3 || ^6.0.0 || ^7.0.0-beta.0 || ^8.0.0" + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" }, - "peerDependenciesMeta": { - "@opentelemetry/api": { - "optional": true - }, - "typescript": { - "optional": true - } + "funding": { + "url": "https://eslint.org/donate" } }, - "node_modules/@sveltejs/vite-plugin-svelte": { - "version": "6.2.1", - "resolved": "https://registry.npmjs.org/@sveltejs/vite-plugin-svelte/-/vite-plugin-svelte-6.2.1.tgz", - "integrity": "sha512-YZs/OSKOQAQCnJvM/P+F1URotNnYNeU3P2s4oIpzm1uFaqUEqRxUB0g5ejMjEb5Gjb9/PiBI5Ktrq4rUUF8UVQ==", + "node_modules/@eslint/object-schema": { + "version": "2.1.7", + "resolved": "https://registry.npmjs.org/@eslint/object-schema/-/object-schema-2.1.7.tgz", + "integrity": "sha512-VtAOaymWVfZcmZbp6E2mympDIHvyjXs/12LqWYjVw6qjrfF+VK+fyG33kChz3nnK+SU5/NeHOqrTEHS8sXO3OA==", "dev": true, - "license": "MIT", - "dependencies": { - "@sveltejs/vite-plugin-svelte-inspector": "^5.0.0", - "debug": "^4.4.1", - "deepmerge": "^4.3.1", - "magic-string": "^0.30.17", - "vitefu": "^1.1.1" - }, + "license": "Apache-2.0", "engines": { - "node": "^20.19 || ^22.12 || >=24" - }, - "peerDependencies": { - "svelte": "^5.0.0", - "vite": "^6.3.0 || ^7.0.0" + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" } }, - "node_modules/@sveltejs/vite-plugin-svelte-inspector": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/@sveltejs/vite-plugin-svelte-inspector/-/vite-plugin-svelte-inspector-5.0.0.tgz", - "integrity": "sha512-iwQ8Z4ET6ZFSt/gC+tVfcsSBHwsqc6RumSaiLUkAurW3BCpJam65cmHw0oOlDMTO0u+PZi9hilBRYN+LZNHTUQ==", + "node_modules/@eslint/plugin-kit": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/@eslint/plugin-kit/-/plugin-kit-0.4.1.tgz", + "integrity": "sha512-43/qtrDUokr7LJqoF2c3+RInu/t4zfrpYdoSDfYyhg52rwLV6TnOvdG4fXm7IkSB3wErkcmJS9iEhjVtOSEjjA==", "dev": true, - "license": "MIT", + "license": "Apache-2.0", "dependencies": { - "debug": "^4.4.1" + "@eslint/core": "^0.17.0", + "levn": "^0.4.1" }, "engines": { - "node": "^20.19 || ^22.12 || >=24" - }, - "peerDependencies": { - "@sveltejs/vite-plugin-svelte": "^6.0.0-next.0", - "svelte": "^5.0.0", - "vite": "^6.3.0 || ^7.0.0" + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" } }, - "node_modules/@swc/helpers": { - "version": "0.5.17", - "resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.17.tgz", - "integrity": "sha512-5IKx/Y13RsYd+sauPb2x+U/xZikHjolzfuDgTAl/Tdf3Q8rslRvC19NKDLgAJQ6wsqADk10ntlv08nPFw/gO/A==", + "node_modules/@floating-ui/core": { + "version": "1.7.5", + "resolved": "https://registry.npmjs.org/@floating-ui/core/-/core-1.7.5.tgz", + "integrity": "sha512-1Ih4WTWyw0+lKyFMcBHGbb5U5FtuHJuujoyyr5zTaWS5EYMeT6Jb2AuDeftsCsEuchO+mM2ij5+q9crhydzLhQ==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", "dependencies": { - "tslib": "^2.8.0" + "@floating-ui/utils": "^0.2.11" } }, - "node_modules/@tailwindcss/forms": { - "version": "0.5.10", - "resolved": "https://registry.npmjs.org/@tailwindcss/forms/-/forms-0.5.10.tgz", - "integrity": "sha512-utI1ONF6uf/pPNO68kmN1b8rEwNXv3czukalo8VtJH8ksIkZXr3Q3VYudZLkCsDd4Wku120uF02hYK25XGPorw==", + "node_modules/@floating-ui/dom": { + "version": "1.7.6", + "resolved": "https://registry.npmjs.org/@floating-ui/dom/-/dom-1.7.6.tgz", + "integrity": "sha512-9gZSAI5XM36880PPMm//9dfiEngYoC6Am2izES1FF406YFsjvyBMmeJ2g4SAju3xWwtuynNRFL2s9hgxpLI5SQ==", "dev": true, "license": "MIT", "dependencies": { - "mini-svg-data-uri": "^1.2.3" + "@floating-ui/core": "^1.7.5", + "@floating-ui/utils": "^0.2.11" + } + }, + "node_modules/@floating-ui/utils": { + "version": "0.2.11", + "resolved": "https://registry.npmjs.org/@floating-ui/utils/-/utils-0.2.11.tgz", + "integrity": "sha512-RiB/yIh78pcIxl6lLMG0CgBXAZ2Y0eVHqMPYugu+9U0AeT6YBeiJpf7lbdJNIugFP5SIjwNRgo4DhR1Qxi26Gg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@hono/node-server": { + "version": "1.19.14", + "resolved": "https://registry.npmjs.org/@hono/node-server/-/node-server-1.19.14.tgz", + "integrity": "sha512-GwtvgtXxnWsucXvbQXkRgqksiH2Qed37H9xHZocE5sA3N8O8O8/8FA3uclQXxXVzc9XBZuEOMK7+r02FmSpHtw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18.14.1" }, "peerDependencies": { - "tailwindcss": ">=3.0.0 || >= 3.0.0-alpha.1 || >= 4.0.0-alpha.20 || >= 4.0.0-beta.1" + "hono": "^4" } }, - "node_modules/@tailwindcss/node": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/node/-/node-4.1.11.tgz", - "integrity": "sha512-yzhzuGRmv5QyU9qLNg4GTlYI6STedBWRE7NjxP45CsFYYq9taI0zJXZBMqIC/c8fViNLhmrbpSFS57EoxUmD6Q==", + "node_modules/@humanfs/core": { + "version": "0.19.2", + "resolved": "https://registry.npmjs.org/@humanfs/core/-/core-0.19.2.tgz", + "integrity": "sha512-UhXNm+CFMWcbChXywFwkmhqjs3PRCmcSa/hfBgLIb7oQ5HNb1wS0icWsGtSAUNgefHeI+eBrA8I1fxmbHsGdvA==", "dev": true, - "license": "MIT", + "license": "Apache-2.0", "dependencies": { - "@ampproject/remapping": "^2.3.0", - "enhanced-resolve": "^5.18.1", - "jiti": "^2.4.2", - "lightningcss": "1.30.1", - "magic-string": "^0.30.17", - "source-map-js": "^1.2.1", - "tailwindcss": "4.1.11" + "@humanfs/types": "^0.15.0" + }, + "engines": { + "node": ">=18.18.0" } }, - "node_modules/@tailwindcss/oxide": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.11.tgz", - "integrity": "sha512-Q69XzrtAhuyfHo+5/HMgr1lAiPP/G40OMFAnws7xcFEYqcypZmdW8eGXaOUIeOl1dzPJBPENXgbjsOyhg2nkrg==", + "node_modules/@humanfs/node": { + "version": "0.16.8", + "resolved": "https://registry.npmjs.org/@humanfs/node/-/node-0.16.8.tgz", + "integrity": "sha512-gE1eQNZ3R++kTzFUpdGlpmy8kDZD/MLyHqDwqjkVQI0JMdI1D51sy1H958PNXYkM2rAac7e5/CnIKZrHtPh3BQ==", "dev": true, - "hasInstallScript": true, - "license": "MIT", + "license": "Apache-2.0", "dependencies": { - "detect-libc": "^2.0.4", - "tar": "^7.4.3" + "@humanfs/core": "^0.19.2", + "@humanfs/types": "^0.15.0", + "@humanwhocodes/retry": "^0.4.0" }, "engines": { - "node": ">= 10" - }, - "optionalDependencies": { - "@tailwindcss/oxide-android-arm64": "4.1.11", - "@tailwindcss/oxide-darwin-arm64": "4.1.11", - "@tailwindcss/oxide-darwin-x64": "4.1.11", - "@tailwindcss/oxide-freebsd-x64": "4.1.11", - "@tailwindcss/oxide-linux-arm-gnueabihf": "4.1.11", - "@tailwindcss/oxide-linux-arm64-gnu": "4.1.11", - "@tailwindcss/oxide-linux-arm64-musl": "4.1.11", - "@tailwindcss/oxide-linux-x64-gnu": "4.1.11", - "@tailwindcss/oxide-linux-x64-musl": "4.1.11", - "@tailwindcss/oxide-wasm32-wasi": "4.1.11", - "@tailwindcss/oxide-win32-arm64-msvc": "4.1.11", - "@tailwindcss/oxide-win32-x64-msvc": "4.1.11" + "node": ">=18.18.0" } }, - "node_modules/@tailwindcss/oxide-android-arm64": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-android-arm64/-/oxide-android-arm64-4.1.11.tgz", - "integrity": "sha512-3IfFuATVRUMZZprEIx9OGDjG3Ou3jG4xQzNTvjDoKmU9JdmoCohQJ83MYd0GPnQIu89YoJqvMM0G3uqLRFtetg==", - "cpu": [ - "arm64" - ], + "node_modules/@humanfs/types": { + "version": "0.15.0", + "resolved": "https://registry.npmjs.org/@humanfs/types/-/types-0.15.0.tgz", + "integrity": "sha512-ZZ1w0aoQkwuUuC7Yf+7sdeaNfqQiiLcSRbfI08oAxqLtpXQr9AIVX7Ay7HLDuiLYAaFPu8oBYNq/QIi9URHJ3Q==", "dev": true, - "license": "MIT", - "optional": true, - "os": [ - "android" - ], + "license": "Apache-2.0", "engines": { - "node": ">= 10" + "node": ">=18.18.0" } }, - "node_modules/@tailwindcss/oxide-darwin-arm64": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-darwin-arm64/-/oxide-darwin-arm64-4.1.11.tgz", - "integrity": "sha512-ESgStEOEsyg8J5YcMb1xl8WFOXfeBmrhAwGsFxxB2CxY9evy63+AtpbDLAyRkJnxLy2WsD1qF13E97uQyP1lfQ==", + "node_modules/@humanwhocodes/module-importer": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@humanwhocodes/module-importer/-/module-importer-1.0.1.tgz", + "integrity": "sha512-bxveV4V8v5Yb4ncFTT3rPSgZBOpCkjfK0y4oVVVJwIuDVBRMDXrPyXRL988i5ap9m9bnyEEjWfm5WkBmtffLfA==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=12.22" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/nzakas" + } + }, + "node_modules/@humanwhocodes/retry": { + "version": "0.4.3", + "resolved": "https://registry.npmjs.org/@humanwhocodes/retry/-/retry-0.4.3.tgz", + "integrity": "sha512-bV0Tgo9K4hfPCek+aMAn81RppFKv2ySDQeMoSZuvTASywNTnVJCArCZE2FWqpvIatKu7VMRLWlR1EazvVhDyhQ==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": ">=18.18" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/nzakas" + } + }, + "node_modules/@iconify/types": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/@iconify/types/-/types-2.0.0.tgz", + "integrity": "sha512-+wluvCrRhXrhyOmRDJ3q8mux9JkKy5SJ/v8ol2tu4FVjyYvtEzkc/3pK15ET6RKg4b4w4BmTk1+gsCUhf21Ykg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@iconify/utils": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/@iconify/utils/-/utils-3.1.3.tgz", + "integrity": "sha512-LPKOXPn/zV+zis1oOfGWogaXVpqUybF3ZS6SCZIsz8vg0ivVp9+fVqyYB7xq0aiST/VhUQYGO1qo6uoYSiEJqw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@antfu/install-pkg": "^1.1.0", + "@iconify/types": "^2.0.0", + "import-meta-resolve": "^4.2.0" + } + }, + "node_modules/@img/sharp-darwin-arm64": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-darwin-arm64/-/sharp-darwin-arm64-0.33.5.tgz", + "integrity": "sha512-UT4p+iz/2H4twwAoLCqfA9UH5pI6DggwKEGuaPy7nCVQ8ZsiY5PIcrRvD1DzuY3qYL07NtIQcWnBSY/heikIFQ==", "cpu": [ "arm64" ], "dev": true, - "license": "MIT", + "license": "Apache-2.0", "optional": true, "os": [ "darwin" ], "engines": { - "node": ">= 10" + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-darwin-arm64": "1.0.4" } }, - "node_modules/@tailwindcss/oxide-darwin-x64": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-darwin-x64/-/oxide-darwin-x64-4.1.11.tgz", - "integrity": "sha512-EgnK8kRchgmgzG6jE10UQNaH9Mwi2n+yw1jWmof9Vyg2lpKNX2ioe7CJdf9M5f8V9uaQxInenZkOxnTVL3fhAw==", + "node_modules/@img/sharp-darwin-x64": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-darwin-x64/-/sharp-darwin-x64-0.33.5.tgz", + "integrity": "sha512-fyHac4jIc1ANYGRDxtiqelIbdWkIuQaI84Mv45KvGRRxSAa7o7d1ZKAOBaYbnepLC1WqxfpimdeWfvqqSGwR2Q==", "cpu": [ "x64" ], "dev": true, - "license": "MIT", + "license": "Apache-2.0", "optional": true, "os": [ "darwin" ], "engines": { - "node": ">= 10" + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-darwin-x64": "1.0.4" } }, - "node_modules/@tailwindcss/oxide-freebsd-x64": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-freebsd-x64/-/oxide-freebsd-x64-4.1.11.tgz", - "integrity": "sha512-xdqKtbpHs7pQhIKmqVpxStnY1skuNh4CtbcyOHeX1YBE0hArj2romsFGb6yUmzkq/6M24nkxDqU8GYrKrz+UcA==", + "node_modules/@img/sharp-libvips-darwin-arm64": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-arm64/-/sharp-libvips-darwin-arm64-1.0.4.tgz", + "integrity": "sha512-XblONe153h0O2zuFfTAbQYAX2JhYmDHeWikp1LM9Hul9gVPjFY427k6dFEcOL72O01QxQsWi761svJ/ev9xEDg==", "cpu": [ - "x64" + "arm64" ], "dev": true, - "license": "MIT", + "license": "LGPL-3.0-or-later", "optional": true, "os": [ - "freebsd" + "darwin" ], - "engines": { - "node": ">= 10" + "funding": { + "url": "https://opencollective.com/libvips" } }, - "node_modules/@tailwindcss/oxide-linux-arm-gnueabihf": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm-gnueabihf/-/oxide-linux-arm-gnueabihf-4.1.11.tgz", - "integrity": "sha512-ryHQK2eyDYYMwB5wZL46uoxz2zzDZsFBwfjssgB7pzytAeCCa6glsiJGjhTEddq/4OsIjsLNMAiMlHNYnkEEeg==", + "node_modules/@img/sharp-libvips-darwin-x64": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-darwin-x64/-/sharp-libvips-darwin-x64-1.0.4.tgz", + "integrity": "sha512-xnGR8YuZYfJGmWPvmlunFaWJsb9T/AO2ykoP3Fz/0X5XV2aoYBPkX6xqCQvUTKKiLddarLaxpzNe+b1hjeWHAQ==", "cpu": [ - "arm" + "x64" ], "dev": true, - "license": "MIT", + "license": "LGPL-3.0-or-later", "optional": true, "os": [ - "linux" + "darwin" ], - "engines": { - "node": ">= 10" + "funding": { + "url": "https://opencollective.com/libvips" } }, - "node_modules/@tailwindcss/oxide-linux-arm64-gnu": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm64-gnu/-/oxide-linux-arm64-gnu-4.1.11.tgz", - "integrity": "sha512-mYwqheq4BXF83j/w75ewkPJmPZIqqP1nhoghS9D57CLjsh3Nfq0m4ftTotRYtGnZd3eCztgbSPJ9QhfC91gDZQ==", + "node_modules/@img/sharp-libvips-linux-arm": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm/-/sharp-libvips-linux-arm-1.0.5.tgz", + "integrity": "sha512-gvcC4ACAOPRNATg/ov8/MnbxFDJqf/pDePbBnuBDcjsI8PssmjoKMAz4LtLaVi+OnSb5FK/yIOamqDwGmXW32g==", "cpu": [ - "arm64" + "arm" ], "dev": true, - "license": "MIT", + "license": "LGPL-3.0-or-later", "optional": true, "os": [ "linux" ], - "engines": { - "node": ">= 10" + "funding": { + "url": "https://opencollective.com/libvips" } }, - "node_modules/@tailwindcss/oxide-linux-arm64-musl": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm64-musl/-/oxide-linux-arm64-musl-4.1.11.tgz", - "integrity": "sha512-m/NVRFNGlEHJrNVk3O6I9ggVuNjXHIPoD6bqay/pubtYC9QIdAMpS+cswZQPBLvVvEF6GtSNONbDkZrjWZXYNQ==", + "node_modules/@img/sharp-libvips-linux-arm64": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-arm64/-/sharp-libvips-linux-arm64-1.0.4.tgz", + "integrity": "sha512-9B+taZ8DlyyqzZQnoeIvDVR/2F4EbMepXMc/NdVbkzsJbzkUjhXv/70GQJ7tdLA4YJgNP25zukcxpX2/SueNrA==", "cpu": [ "arm64" ], "dev": true, - "license": "MIT", + "license": "LGPL-3.0-or-later", "optional": true, "os": [ "linux" ], - "engines": { - "node": ">= 10" + "funding": { + "url": "https://opencollective.com/libvips" } }, - "node_modules/@tailwindcss/oxide-linux-x64-gnu": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-x64-gnu/-/oxide-linux-x64-gnu-4.1.11.tgz", - "integrity": "sha512-YW6sblI7xukSD2TdbbaeQVDysIm/UPJtObHJHKxDEcW2exAtY47j52f8jZXkqE1krdnkhCMGqP3dbniu1Te2Fg==", + "node_modules/@img/sharp-libvips-linux-s390x": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-s390x/-/sharp-libvips-linux-s390x-1.0.4.tgz", + "integrity": "sha512-u7Wz6ntiSSgGSGcjZ55im6uvTrOxSIS8/dgoVMoiGE9I6JAfU50yH5BoDlYA1tcuGS7g/QNtetJnxA6QEsCVTA==", "cpu": [ - "x64" + "s390x" ], "dev": true, - "license": "MIT", + "license": "LGPL-3.0-or-later", "optional": true, "os": [ "linux" ], - "engines": { - "node": ">= 10" + "funding": { + "url": "https://opencollective.com/libvips" } }, - "node_modules/@tailwindcss/oxide-linux-x64-musl": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-x64-musl/-/oxide-linux-x64-musl-4.1.11.tgz", - "integrity": "sha512-e3C/RRhGunWYNC3aSF7exsQkdXzQ/M+aYuZHKnw4U7KQwTJotnWsGOIVih0s2qQzmEzOFIJ3+xt7iq67K/p56Q==", + "node_modules/@img/sharp-libvips-linux-x64": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linux-x64/-/sharp-libvips-linux-x64-1.0.4.tgz", + "integrity": "sha512-MmWmQ3iPFZr0Iev+BAgVMb3ZyC4KeFc3jFxnNbEPas60e1cIfevbtuyf9nDGIzOaW9PdnDciJm+wFFaTlj5xYw==", "cpu": [ "x64" ], "dev": true, - "license": "MIT", + "license": "LGPL-3.0-or-later", "optional": true, "os": [ "linux" ], - "engines": { - "node": ">= 10" + "funding": { + "url": "https://opencollective.com/libvips" } }, - "node_modules/@tailwindcss/oxide-wasm32-wasi": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-wasm32-wasi/-/oxide-wasm32-wasi-4.1.11.tgz", - "integrity": "sha512-Xo1+/GU0JEN/C/dvcammKHzeM6NqKovG+6921MR6oadee5XPBaKOumrJCXvopJ/Qb5TH7LX/UAywbqrP4lax0g==", - "bundleDependencies": [ - "@napi-rs/wasm-runtime", - "@emnapi/core", - "@emnapi/runtime", - "@tybys/wasm-util", - "@emnapi/wasi-threads", - "tslib" - ], + "node_modules/@img/sharp-libvips-linuxmusl-arm64": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-arm64/-/sharp-libvips-linuxmusl-arm64-1.0.4.tgz", + "integrity": "sha512-9Ti+BbTYDcsbp4wfYib8Ctm1ilkugkA/uscUn6UXK1ldpC1JjiXbLfFZtRlBhjPZ5o1NCLiDbg8fhUPKStHoTA==", "cpu": [ - "wasm32" + "arm64" ], "dev": true, - "license": "MIT", - "optional": true, - "dependencies": { - "@emnapi/core": "^1.4.3", - "@emnapi/runtime": "^1.4.3", - "@emnapi/wasi-threads": "^1.0.2", - "@napi-rs/wasm-runtime": "^0.2.11", - "@tybys/wasm-util": "^0.9.0", - "tslib": "^2.8.0" - }, - "engines": { - "node": ">=14.0.0" - } - }, - "node_modules/@tailwindcss/oxide-wasm32-wasi/node_modules/@emnapi/core": { - "version": "1.4.3", - "dev": true, - "inBundle": true, - "license": "MIT", - "optional": true, - "dependencies": { - "@emnapi/wasi-threads": "1.0.2", - "tslib": "^2.4.0" - } - }, - "node_modules/@tailwindcss/oxide-wasm32-wasi/node_modules/@emnapi/runtime": { - "version": "1.4.3", - "dev": true, - "inBundle": true, - "license": "MIT", + "license": "LGPL-3.0-or-later", "optional": true, - "dependencies": { - "tslib": "^2.4.0" + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" } }, - "node_modules/@tailwindcss/oxide-wasm32-wasi/node_modules/@emnapi/wasi-threads": { - "version": "1.0.2", + "node_modules/@img/sharp-libvips-linuxmusl-x64": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@img/sharp-libvips-linuxmusl-x64/-/sharp-libvips-linuxmusl-x64-1.0.4.tgz", + "integrity": "sha512-viYN1KX9m+/hGkJtvYYp+CCLgnJXwiQB39damAO7WMdKWlIhmYTfHjwSbQeUK/20vY154mwezd9HflVFM1wVSw==", + "cpu": [ + "x64" + ], "dev": true, - "inBundle": true, - "license": "MIT", + "license": "LGPL-3.0-or-later", "optional": true, - "dependencies": { - "tslib": "^2.4.0" + "os": [ + "linux" + ], + "funding": { + "url": "https://opencollective.com/libvips" } }, - "node_modules/@tailwindcss/oxide-wasm32-wasi/node_modules/@napi-rs/wasm-runtime": { - "version": "0.2.11", + "node_modules/@img/sharp-linux-arm": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm/-/sharp-linux-arm-0.33.5.tgz", + "integrity": "sha512-JTS1eldqZbJxjvKaAkxhZmBqPRGmxgu+qFKSInv8moZ2AmT5Yib3EQ1c6gp493HvrvV8QgdOXdyaIBrhvFhBMQ==", + "cpu": [ + "arm" + ], "dev": true, - "inBundle": true, - "license": "MIT", + "license": "Apache-2.0", "optional": true, - "dependencies": { - "@emnapi/core": "^1.4.3", - "@emnapi/runtime": "^1.4.3", - "@tybys/wasm-util": "^0.9.0" + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-arm": "1.0.5" } }, - "node_modules/@tailwindcss/oxide-wasm32-wasi/node_modules/@tybys/wasm-util": { - "version": "0.9.0", + "node_modules/@img/sharp-linux-arm64": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-arm64/-/sharp-linux-arm64-0.33.5.tgz", + "integrity": "sha512-JMVv+AMRyGOHtO1RFBiJy/MBsgz0x4AWrT6QoEVVTyh1E39TrCUpTRI7mx9VksGX4awWASxqCYLCV4wBZHAYxA==", + "cpu": [ + "arm64" + ], "dev": true, - "inBundle": true, - "license": "MIT", + "license": "Apache-2.0", "optional": true, - "dependencies": { - "tslib": "^2.4.0" + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-arm64": "1.0.4" } }, - "node_modules/@tailwindcss/oxide-wasm32-wasi/node_modules/tslib": { - "version": "2.8.0", - "dev": true, - "inBundle": true, - "license": "0BSD", - "optional": true - }, - "node_modules/@tailwindcss/oxide-win32-arm64-msvc": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-win32-arm64-msvc/-/oxide-win32-arm64-msvc-4.1.11.tgz", - "integrity": "sha512-UgKYx5PwEKrac3GPNPf6HVMNhUIGuUh4wlDFR2jYYdkX6pL/rn73zTq/4pzUm8fOjAn5L8zDeHp9iXmUGOXZ+w==", + "node_modules/@img/sharp-linux-s390x": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-s390x/-/sharp-linux-s390x-0.33.5.tgz", + "integrity": "sha512-y/5PCd+mP4CA/sPDKl2961b+C9d+vPAveS33s6Z3zfASk2j5upL6fXVPZi7ztePZ5CuH+1kW8JtvxgbuXHRa4Q==", "cpu": [ - "arm64" + "s390x" ], "dev": true, - "license": "MIT", + "license": "Apache-2.0", "optional": true, "os": [ - "win32" + "linux" ], "engines": { - "node": ">= 10" + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-s390x": "1.0.4" } }, - "node_modules/@tailwindcss/oxide-win32-x64-msvc": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-win32-x64-msvc/-/oxide-win32-x64-msvc-4.1.11.tgz", - "integrity": "sha512-YfHoggn1j0LK7wR82TOucWc5LDCguHnoS879idHekmmiR7g9HUtMw9MI0NHatS28u/Xlkfi9w5RJWgz2Dl+5Qg==", + "node_modules/@img/sharp-linux-x64": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linux-x64/-/sharp-linux-x64-0.33.5.tgz", + "integrity": "sha512-opC+Ok5pRNAzuvq1AG0ar+1owsu842/Ab+4qvU879ippJBHvyY5n2mxF1izXqkPYlGuP/M556uh53jRLJmzTWA==", "cpu": [ "x64" ], "dev": true, - "license": "MIT", + "license": "Apache-2.0", "optional": true, "os": [ - "win32" + "linux" ], "engines": { - "node": ">= 10" - } - }, - "node_modules/@tailwindcss/typography": { - "version": "0.5.16", - "resolved": "https://registry.npmjs.org/@tailwindcss/typography/-/typography-0.5.16.tgz", - "integrity": "sha512-0wDLwCVF5V3x3b1SGXPCDcdsbDHMBe+lkFzBRaHeLvNi+nrrnZ1lA18u+OTWO8iSWU2GxUOCvlXtDuqftc1oiA==", - "dev": true, - "license": "MIT", - "dependencies": { - "lodash.castarray": "^4.4.0", - "lodash.isplainobject": "^4.0.6", - "lodash.merge": "^4.6.2", - "postcss-selector-parser": "6.0.10" + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" }, - "peerDependencies": { - "tailwindcss": ">=3.0.0 || insiders || >=4.0.0-alpha.20 || >=4.0.0-beta.1" + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linux-x64": "1.0.4" } }, - "node_modules/@tailwindcss/vite": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/@tailwindcss/vite/-/vite-4.1.11.tgz", - "integrity": "sha512-RHYhrR3hku0MJFRV+fN2gNbDNEh3dwKvY8XJvTxCSXeMOsCRSr+uKvDWQcbizrHgjML6ZmTE5OwMrl5wKcujCw==", + "node_modules/@img/sharp-linuxmusl-arm64": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-arm64/-/sharp-linuxmusl-arm64-0.33.5.tgz", + "integrity": "sha512-XrHMZwGQGvJg2V/oRSUfSAfjfPxO+4DkiRh6p2AFjLQztWUuY/o8Mq0eMQVIY7HJ1CDQUJlxGGZRw1a5bqmd1g==", + "cpu": [ + "arm64" + ], "dev": true, - "license": "MIT", - "dependencies": { - "@tailwindcss/node": "4.1.11", - "@tailwindcss/oxide": "4.1.11", - "tailwindcss": "4.1.11" + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" }, - "peerDependencies": { - "vite": "^5.2.0 || ^6 || ^7" + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linuxmusl-arm64": "1.0.4" } }, - "node_modules/@testing-library/dom": { - "version": "10.4.1", - "resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz", - "integrity": "sha512-o4PXJQidqJl82ckFaXUeoAW+XysPLauYI43Abki5hABd853iMhitooc6znOnczgbTYmEP6U6/y1ZyKAIsvMKGg==", + "node_modules/@img/sharp-linuxmusl-x64": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-linuxmusl-x64/-/sharp-linuxmusl-x64-0.33.5.tgz", + "integrity": "sha512-WT+d/cgqKkkKySYmqoZ8y3pxx7lx9vVejxW/W4DOFMYVSkErR+w7mf2u8m/y4+xHe7yY9DAXQMWQhpnMuFfScw==", + "cpu": [ + "x64" + ], "dev": true, - "license": "MIT", - "peer": true, - "dependencies": { - "@babel/code-frame": "^7.10.4", - "@babel/runtime": "^7.12.5", - "@types/aria-query": "^5.0.1", - "aria-query": "5.3.0", - "dom-accessibility-api": "^0.5.9", - "lz-string": "^1.5.0", - "picocolors": "1.1.1", - "pretty-format": "^27.0.2" - }, + "license": "Apache-2.0", + "optional": true, + "os": [ + "linux" + ], "engines": { - "node": ">=18" + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-libvips-linuxmusl-x64": "1.0.4" } }, - "node_modules/@testing-library/jest-dom": { - "version": "6.9.1", - "resolved": "https://registry.npmjs.org/@testing-library/jest-dom/-/jest-dom-6.9.1.tgz", - "integrity": "sha512-zIcONa+hVtVSSep9UT3jZ5rizo2BsxgyDYU7WFD5eICBE7no3881HGeb/QkGfsJs6JTkY1aQhT7rIPC7e+0nnA==", + "node_modules/@img/sharp-wasm32": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-wasm32/-/sharp-wasm32-0.33.5.tgz", + "integrity": "sha512-ykUW4LVGaMcU9lu9thv85CbRMAwfeadCJHRsg2GmeRa/cJxsVY9Rbd57JcMxBkKHag5U/x7TSBpScF4U8ElVzg==", + "cpu": [ + "wasm32" + ], "dev": true, - "license": "MIT", + "license": "Apache-2.0 AND LGPL-3.0-or-later AND MIT", + "optional": true, "dependencies": { - "@adobe/css-tools": "^4.4.0", - "aria-query": "^5.0.0", - "css.escape": "^1.5.1", - "dom-accessibility-api": "^0.6.3", - "picocolors": "^1.1.1", - "redent": "^3.0.0" + "@emnapi/runtime": "^1.2.0" }, "engines": { - "node": ">=14", - "npm": ">=6", - "yarn": ">=1" + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" } }, - "node_modules/@testing-library/jest-dom/node_modules/dom-accessibility-api": { - "version": "0.6.3", - "resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.6.3.tgz", - "integrity": "sha512-7ZgogeTnjuHbo+ct10G9Ffp0mif17idi0IyWNVA/wcwcm7NPOD/WEHVP3n7n3MhXqxoIYm8d6MuZohYWIZ4T3w==", - "dev": true, - "license": "MIT" - }, - "node_modules/@testing-library/svelte-core": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/@testing-library/svelte-core/-/svelte-core-1.0.0.tgz", - "integrity": "sha512-VkUePoLV6oOYwSUvX6ShA8KLnJqZiYMIbP2JW2t0GLWLkJxKGvuH5qrrZBV/X7cXFnLGuFQEC7RheYiZOW68KQ==", + "node_modules/@img/sharp-win32-ia32": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-ia32/-/sharp-win32-ia32-0.33.5.tgz", + "integrity": "sha512-T36PblLaTwuVJ/zw/LaH0PdZkRz5rd3SmMHX8GSmR7vtNSP5Z6bQkExdSK7xGWyxLw4sUknBuugTelgw2faBbQ==", + "cpu": [ + "ia32" + ], "dev": true, - "license": "MIT", + "license": "Apache-2.0 AND LGPL-3.0-or-later", + "optional": true, + "os": [ + "win32" + ], "engines": { - "node": ">=16" + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" }, - "peerDependencies": { - "svelte": "^3 || ^4 || ^5 || ^5.0.0-next.0" + "funding": { + "url": "https://opencollective.com/libvips" } }, - "node_modules/@testing-library/user-event": { - "version": "14.6.1", - "resolved": "https://registry.npmjs.org/@testing-library/user-event/-/user-event-14.6.1.tgz", - "integrity": "sha512-vq7fv0rnt+QTXgPxr5Hjc210p6YKq2kmdziLgnsZGgLJ9e6VAShx1pACLuRjd/AS/sr7phAR58OIIpf0LlmQNw==", + "node_modules/@img/sharp-win32-x64": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/@img/sharp-win32-x64/-/sharp-win32-x64-0.33.5.tgz", + "integrity": "sha512-MpY/o8/8kj+EcnxwvrP4aTJSWw/aZ7JIGR4aBeZkZw5B7/Jn+tY9/VNwtcoGmdT7GfggGIU4kygOMSbYnOrAbg==", + "cpu": [ + "x64" + ], "dev": true, - "license": "MIT", + "license": "Apache-2.0 AND LGPL-3.0-or-later", + "optional": true, + "os": [ + "win32" + ], "engines": { - "node": ">=12", - "npm": ">=6" + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" }, - "peerDependencies": { - "@testing-library/dom": ">=7.21.4" + "funding": { + "url": "https://opencollective.com/libvips" } }, - "node_modules/@types/aria-query": { - "version": "5.0.4", - "resolved": "https://registry.npmjs.org/@types/aria-query/-/aria-query-5.0.4.tgz", - "integrity": "sha512-rfT93uj5s0PRL7EzccGMs3brplhcrghnDoV26NqKhCAS1hVo+WdNsPvE/yb6ilfr5hi2MEk6d5EWJTKdxg8jVw==", - "dev": true, - "license": "MIT", - "peer": true - }, - "node_modules/@types/chai": { - "version": "5.2.2", - "resolved": "https://registry.npmjs.org/@types/chai/-/chai-5.2.2.tgz", - "integrity": "sha512-8kB30R7Hwqf40JPiKhVzodJs2Qc1ZJ5zuT3uzw5Hq/dhNCl3G3l83jfpdI1e20BP348+fV7VIL/+FxaXkqBmWg==", + "node_modules/@internationalized/date": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/@internationalized/date/-/date-3.10.1.tgz", + "integrity": "sha512-oJrXtQiAXLvT9clCf1K4kxp3eKsQhIaZqxEyowkBcsvZDdZkbWrVmnGknxs5flTD0VGsxrxKgBCZty1EzoiMzA==", "dev": true, - "license": "MIT", + "license": "Apache-2.0", "dependencies": { - "@types/deep-eql": "*" + "@swc/helpers": "^0.5.0" } }, - "node_modules/@types/cookie": { - "version": "0.6.0", - "resolved": "https://registry.npmjs.org/@types/cookie/-/cookie-0.6.0.tgz", - "integrity": "sha512-4Kh9a6B2bQciAhf7FSuMRRkUWecJgJu9nPnx3yzpsfXX/c50REIqpHY4C82bXP90qrLtXtkDxTZosYO3UpOwlA==", + "node_modules/@isaacs/fs-minipass": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/@isaacs/fs-minipass/-/fs-minipass-4.0.1.tgz", + "integrity": "sha512-wgm9Ehl2jpeqP3zw/7mo3kRHFp5MEDhqAdwy1fTGkHAwnkGOVsgpvQhL8B5n1qlb01jV3n/bI0ZfZp5lWA1k4w==", "dev": true, - "license": "MIT" + "license": "ISC", + "dependencies": { + "minipass": "^7.0.4" + }, + "engines": { + "node": ">=18.0.0" + } }, - "node_modules/@types/d3": { - "version": "7.4.3", - "resolved": "https://registry.npmjs.org/@types/d3/-/d3-7.4.3.tgz", - "integrity": "sha512-lZXZ9ckh5R8uiFVt8ogUNf+pIrK4EsWrx2Np75WvF/eTpJ0FMHNhjXk8CKEx/+gpHbNQyJWehbFaTvqmHWB3ww==", + "node_modules/@jridgewell/gen-mapping": { + "version": "0.3.13", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.13.tgz", + "integrity": "sha512-2kkt/7niJ6MgEPxF0bYdQ6etZaA+fQvDcLKckhy1yIQOzaoKjBBjSj63/aLVjYE3qhRt5dvM+uUyfCg6UKCBbA==", "dev": true, "license": "MIT", "dependencies": { - "@types/d3-array": "*", - "@types/d3-axis": "*", - "@types/d3-brush": "*", - "@types/d3-chord": "*", - "@types/d3-color": "*", - "@types/d3-contour": "*", - "@types/d3-delaunay": "*", - "@types/d3-dispatch": "*", - "@types/d3-drag": "*", - "@types/d3-dsv": "*", - "@types/d3-ease": "*", - "@types/d3-fetch": "*", - "@types/d3-force": "*", - "@types/d3-format": "*", - "@types/d3-geo": "*", - "@types/d3-hierarchy": "*", - "@types/d3-interpolate": "*", - "@types/d3-path": "*", - "@types/d3-polygon": "*", - "@types/d3-quadtree": "*", - "@types/d3-random": "*", - "@types/d3-scale": "*", - "@types/d3-scale-chromatic": "*", - "@types/d3-selection": "*", - "@types/d3-shape": "*", - "@types/d3-time": "*", - "@types/d3-time-format": "*", - "@types/d3-timer": "*", - "@types/d3-transition": "*", - "@types/d3-zoom": "*" + "@jridgewell/sourcemap-codec": "^1.5.0", + "@jridgewell/trace-mapping": "^0.3.24" } }, - "node_modules/@types/d3-array": { - "version": "3.2.2", - "resolved": "https://registry.npmjs.org/@types/d3-array/-/d3-array-3.2.2.tgz", - "integrity": "sha512-hOLWVbm7uRza0BYXpIIW5pxfrKe0W+D5lrFiAEYR+pb6w3N2SwSMaJbXdUfSEv+dT4MfHBLtn5js0LAWaO6otw==", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/d3-axis": { - "version": "3.0.6", - "resolved": "https://registry.npmjs.org/@types/d3-axis/-/d3-axis-3.0.6.tgz", - "integrity": "sha512-pYeijfZuBd87T0hGn0FO1vQ/cgLk6E1ALJjfkC0oJ8cbwkZl3TpgS8bVBLZN+2jjGgg38epgxb2zmoGtSfvgMw==", + "node_modules/@jridgewell/remapping": { + "version": "2.3.5", + "resolved": "https://registry.npmjs.org/@jridgewell/remapping/-/remapping-2.3.5.tgz", + "integrity": "sha512-LI9u/+laYG4Ds1TDKSJW2YPrIlcVYOwi2fUC6xB43lueCjgxV4lffOCZCtYFiH6TNOX+tQKXx97T4IKHbhyHEQ==", "dev": true, "license": "MIT", "dependencies": { - "@types/d3-selection": "*" + "@jridgewell/gen-mapping": "^0.3.5", + "@jridgewell/trace-mapping": "^0.3.24" } }, - "node_modules/@types/d3-brush": { - "version": "3.0.6", - "resolved": "https://registry.npmjs.org/@types/d3-brush/-/d3-brush-3.0.6.tgz", - "integrity": "sha512-nH60IZNNxEcrh6L1ZSMNA28rj27ut/2ZmI3r96Zd+1jrZD++zD3LsMIjWlvg4AYrHn/Pqz4CF3veCxGjtbqt7A==", + "node_modules/@jridgewell/resolve-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", + "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", "dev": true, "license": "MIT", - "dependencies": { - "@types/d3-selection": "*" + "engines": { + "node": ">=6.0.0" } }, - "node_modules/@types/d3-chord": { - "version": "3.0.6", - "resolved": "https://registry.npmjs.org/@types/d3-chord/-/d3-chord-3.0.6.tgz", - "integrity": "sha512-LFYWWd8nwfwEmTZG9PfQxd17HbNPksHBiJHaKuY1XeqscXacsS2tyoo6OdRsjf+NQYeB6XrNL3a25E3gH69lcg==", + "node_modules/@jridgewell/source-map": { + "version": "0.3.11", + "resolved": "https://registry.npmjs.org/@jridgewell/source-map/-/source-map-0.3.11.tgz", + "integrity": "sha512-ZMp1V8ZFcPG5dIWnQLr3NSI1MiCU7UETdS/A0G8V/XWHvJv3ZsFqutJn1Y5RPmAPX6F3BiE397OqveU/9NCuIA==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.5", + "@jridgewell/trace-mapping": "^0.3.25" + } }, - "node_modules/@types/d3-color": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/@types/d3-color/-/d3-color-3.1.3.tgz", - "integrity": "sha512-iO90scth9WAbmgv7ogoq57O9YpKmFBbmoEoCHDB2xMBY0+/KVrqAaCDyCE16dUspeOvIxFFRI+0sEtqDqy2b4A==", + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.5.5", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.5.5.tgz", + "integrity": "sha512-cYQ9310grqxueWbl+WuIUIaiUaDcj7WOq5fVhEljNVgRfOUhY9fy2zTvfoqWsnebh8Sl70VScFbICvJnLKB0Og==", "dev": true, "license": "MIT" }, - "node_modules/@types/d3-contour": { - "version": "3.0.6", - "resolved": "https://registry.npmjs.org/@types/d3-contour/-/d3-contour-3.0.6.tgz", - "integrity": "sha512-BjzLgXGnCWjUSYGfH1cpdo41/hgdWETu4YxpezoztawmqsvCeep+8QGfiY6YbDvfgHz/DkjeIkkZVJavB4a3rg==", + "node_modules/@jridgewell/trace-mapping": { + "version": "0.3.31", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.31.tgz", + "integrity": "sha512-zzNR+SdQSDJzc8joaeP8QQoCQr8NuYx2dIIytl1QeBEZHJ9uW6hebsrYgbz8hJwUQao3TWCMtmfV8Nu1twOLAw==", "dev": true, "license": "MIT", "dependencies": { - "@types/d3-array": "*", - "@types/geojson": "*" + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, + "node_modules/@lucide/svelte": { + "version": "0.515.0", + "resolved": "https://registry.npmjs.org/@lucide/svelte/-/svelte-0.515.0.tgz", + "integrity": "sha512-CEAyqcZmNBfYzVgaRmK2RFJP5tnbXxekRyDk0XX/eZQRfsJmkDvmQwXNX8C869BgNeryzmrRyjHhUL6g9ZOHNA==", + "dev": true, + "license": "ISC", + "peerDependencies": { + "svelte": "^5" + } + }, + "node_modules/@mdx-js/react": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/@mdx-js/react/-/react-3.1.1.tgz", + "integrity": "sha512-f++rKLQgUVYDAtECQ6fn/is15GkEH9+nZPM3MS0RcxVqoTfawHvDlSCH7JbMhAM6uJ32v3eXLvLmLvjGu7PTQw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/mdx": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + }, + "peerDependencies": { + "@types/react": ">=16", + "react": ">=16" + } + }, + "node_modules/@mermaid-js/parser": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@mermaid-js/parser/-/parser-1.1.1.tgz", + "integrity": "sha512-VuHdsYMK1bT6X2JbcAaWAhugTRvRBRyuZgd+c22swUeI9g/ntaxF7CY7dYarhZovofCbUNO0G7JesfmNtjYOCw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@chevrotain/types": "~11.1.1" + } + }, + "node_modules/@modelcontextprotocol/sdk": { + "version": "1.26.0", + "resolved": "https://registry.npmjs.org/@modelcontextprotocol/sdk/-/sdk-1.26.0.tgz", + "integrity": "sha512-Y5RmPncpiDtTXDbLKswIJzTqu2hyBKxTNsgKqKclDbhIgg1wgtf1fRuvxgTnRfcnxtvvgbIEcqUOzZrJ6iSReg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@hono/node-server": "^1.19.9", + "ajv": "^8.17.1", + "ajv-formats": "^3.0.1", + "content-type": "^1.0.5", + "cors": "^2.8.5", + "cross-spawn": "^7.0.5", + "eventsource": "^3.0.2", + "eventsource-parser": "^3.0.0", + "express": "^5.2.1", + "express-rate-limit": "^8.2.1", + "hono": "^4.11.4", + "jose": "^6.1.3", + "json-schema-typed": "^8.0.2", + "pkce-challenge": "^5.0.0", + "raw-body": "^3.0.0", + "zod": "^3.25 || ^4.0", + "zod-to-json-schema": "^3.25.1" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@cfworker/json-schema": "^4.1.1", + "zod": "^3.25 || ^4.0" + }, + "peerDependenciesMeta": { + "@cfworker/json-schema": { + "optional": true + }, + "zod": { + "optional": false + } + } + }, + "node_modules/@napi-rs/canvas": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.100.tgz", + "integrity": "sha512-xglYA6q3XO5P3BNJYxVZ1IV7DLVjp1Py6nwag88YntrS+3vKHyYcMqXVS4ZztJmwz2uGvz1FWhI/4LgbR5uQDA==", + "dev": true, + "license": "MIT", + "optional": true, + "workspaces": [ + "e2e/*" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + }, + "optionalDependencies": { + "@napi-rs/canvas-android-arm64": "0.1.100", + "@napi-rs/canvas-darwin-arm64": "0.1.100", + "@napi-rs/canvas-darwin-x64": "0.1.100", + "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.100", + "@napi-rs/canvas-linux-arm64-gnu": "0.1.100", + "@napi-rs/canvas-linux-arm64-musl": "0.1.100", + "@napi-rs/canvas-linux-riscv64-gnu": "0.1.100", + "@napi-rs/canvas-linux-x64-gnu": "0.1.100", + "@napi-rs/canvas-linux-x64-musl": "0.1.100", + "@napi-rs/canvas-win32-arm64-msvc": "0.1.100", + "@napi-rs/canvas-win32-x64-msvc": "0.1.100" + } + }, + "node_modules/@napi-rs/canvas-android-arm64": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.100.tgz", + "integrity": "sha512-hjhCKhntPv9+t4ckHymdx0phYNcVW+GKQR6Lzw2zE+pOVjOplSmtx9nNNknTjbEDLcuLZqA1y8ufKg1XfgftzQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-darwin-arm64": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.100.tgz", + "integrity": "sha512-2PcswRaC7Ly645DGt88///zuFDhJxJYdKAs1uU3mfk1atYkXufgcgLfBpk6Tm12nCQBaNt1wpybuPZ4qOhTo8A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-darwin-x64": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.100.tgz", + "integrity": "sha512-ePNZtj7pNIva/siZMg+HmbeozkIjqUIYdoymH8HaA3qK7LfzFN4WMBM8G6HQ9ZC+H3+Dnn5pqtiXpgLykaPOhw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.100.tgz", + "integrity": "sha512-d5cDB48oWFGU8/XPhUOFAlySgb/VAu7D+s8fi55K1Pcfg8aPplHWqMgibhVLU8ky7Pyg/fuiVLz4Nf3JrSTuUA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-gnu": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.100.tgz", + "integrity": "sha512-rDxgxRu69RvDlX/bh9o22DxLsGr8EqsNgotL9+RwQE1S0b0cqeatqsw6aW45mukm0B42DIAaAacKaYQ8cqS1nw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-musl": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.100.tgz", + "integrity": "sha512-K3mDW66N+xT2/V439u1alFANiBUjdEx2gLiNYnCmUsva5jZMxWTjafBYwTzYK+EMFMHrUoabuU+T1BIP5CgbYQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-riscv64-gnu": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.100.tgz", + "integrity": "sha512-mooqUBTIsccZpnoQC4NgrC1v6C1vof39etLNMnBwCY+p0gajWJvAHLGQ6g/gGyS5YrpDW+GefSN4+Cvcr08UWw==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-gnu": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.100.tgz", + "integrity": "sha512-1eCvkDCazm7FFhsT7DfGOdSaHgZVK3bt/dSBl5EWHOWmnz+I7j8tPseJqqD81NF+MH21jKUK4wQSDjN0mdhnTg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-musl": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.100.tgz", + "integrity": "sha512-20arT6lnI19S68qNlii73TSEDbECNgzMz2EpldC1V3mZFuRkeujXkcebRk0LRJe9SEUAooYiLokfMViY8IX7yA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-win32-arm64-msvc": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-arm64-msvc/-/canvas-win32-arm64-msvc-0.1.100.tgz", + "integrity": "sha512-DZFFT1wIAg37LJw37yhMRFfjATd3vTQzjZ1Yki8u2vhO6Hi5VE6BVaGQ1aaDu7xb4iMErz+9EOwjpS7xcxFeBw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-win32-x64-msvc": { + "version": "0.1.100", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.100.tgz", + "integrity": "sha512-MyT1j3mHC2+Lu4pBi9mKyMJhtP6U7k7EldY7sj/uS5gJA65gTXt8MefJQXLJo5d/vZbuWmfxzkEUNc/urV3pHA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/wasm-runtime": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/@napi-rs/wasm-runtime/-/wasm-runtime-1.1.4.tgz", + "integrity": "sha512-3NQNNgA1YSlJb/kMH1ildASP9HW7/7kYnRI2szWJaofaS1hWmbGI4H+d3+22aGzXXN9IJ+n+GiFVcGipJP18ow==", + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "@tybys/wasm-util": "^0.10.1" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + }, + "peerDependencies": { + "@emnapi/core": "^1.7.1", + "@emnapi/runtime": "^1.7.1" + } + }, + "node_modules/@neoconfetti/react": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@neoconfetti/react/-/react-1.0.0.tgz", + "integrity": "sha512-klcSooChXXOzIm+SE5IISIAn3bYzYfPjbX7D7HoqZL84oAfgREeSg5vSIaSFH+DaGzzvImTyWe1OyrJ67vik4A==", + "dev": true, + "license": "MIT" + }, + "node_modules/@oxc-parser/binding-android-arm-eabi": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-android-arm-eabi/-/binding-android-arm-eabi-0.127.0.tgz", + "integrity": "sha512-0LC7ye4hvqbIKxAzThzvswgHLFu2AURKzYLeSVvLdu2TBOYWQDmHnTqPLeA597BcUCxiLqLsS4CJ5uoI5WYWCQ==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-android-arm64": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-android-arm64/-/binding-android-arm64-0.127.0.tgz", + "integrity": "sha512-b5jtVTH6AU5CJXHNdj7Jj9IEiR9yVjjnwHzPJhGyHGPdcsZSzBCkS9GBbV33niRMvKthDwQRFRJfI4a+k4PvYg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-darwin-arm64": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-darwin-arm64/-/binding-darwin-arm64-0.127.0.tgz", + "integrity": "sha512-obCE8B7ISKkJidjlhv9xRGJPOSDG2Yu6PRga9Ruaz35uintHxbp1Ki/Yc71wx4rj3Edrm0a1kzG1TAwit0wFpg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-darwin-x64": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-darwin-x64/-/binding-darwin-x64-0.127.0.tgz", + "integrity": "sha512-JL6Xb5IwPQT8rUzlpsX7E+AgfcdNklXNPFp8pjCQQ5MQOQo5rtEB2ui+3Hgg9Sn7Y9Egj6YOLLiHhLpdAe12Aw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-freebsd-x64": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-freebsd-x64/-/binding-freebsd-x64-0.127.0.tgz", + "integrity": "sha512-SDQ/3MQFw58fqQz3Z1PhSKFF3JoCF4gmlNjziDm8X02tTahCw0qJbd7FGPDKw1i4VTBZene9JPyC3mHtSvi+wA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-linux-arm-gnueabihf": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-linux-arm-gnueabihf/-/binding-linux-arm-gnueabihf-0.127.0.tgz", + "integrity": "sha512-Av+D1MIqzV0YMGPT9we2SIZaMKD7Cxs4CvXSx/yxaWHewZjYEjScpOf5igc8IILASViw4WTnjlwUdI1KzVtDHQ==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-linux-arm-musleabihf": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-linux-arm-musleabihf/-/binding-linux-arm-musleabihf-0.127.0.tgz", + "integrity": "sha512-Cs2fdJ8cPpFdeebj6p4dag8A4+56hPvZ0AhQQzlaLswGz1tz7bXt1nETLeorrM9+AMcWFFkqxcXwDGfTVidY8g==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-linux-arm64-gnu": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-linux-arm64-gnu/-/binding-linux-arm64-gnu-0.127.0.tgz", + "integrity": "sha512-qdOfTcT6SY8gsJrrV92uyEUyjqMGPpIB5JZUG6QN5dukYd+7/j0kX6MwK1DgQj39jtUYixxPiaRUiEN1+0CXgQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-linux-arm64-musl": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-linux-arm64-musl/-/binding-linux-arm64-musl-0.127.0.tgz", + "integrity": "sha512-EoTCZneNFU/P2qrpEM+RHmQwt+CvDkyGESG6qhr7KaegXLZwePfbrkCDfAk8/rhxbDUVGsZILX+2tqPzFtoFWA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-linux-ppc64-gnu": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-linux-ppc64-gnu/-/binding-linux-ppc64-gnu-0.127.0.tgz", + "integrity": "sha512-zALjmZYgxFLHjXeudcDF0xFGNydTAtkAeXAr2EuC17ywCyFxcmQra4w0BMde0Yi/re4Bi4iwEoEXtYN7l6eBLQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-linux-riscv64-gnu": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-linux-riscv64-gnu/-/binding-linux-riscv64-gnu-0.127.0.tgz", + "integrity": "sha512-fPP8M6zQLS7Jz7o9d5ArUSuAuSK3e+WCYVrCpdzeCOejidtZExJ9tjhDrAd3HEPqARBCPmdpqxESPFqy44vkBQ==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-linux-riscv64-musl": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-linux-riscv64-musl/-/binding-linux-riscv64-musl-0.127.0.tgz", + "integrity": "sha512-7IcC4Ao02oGpfnjt+X/oF4U2mllo2qoSkw5xxiXNKL9MCTsTiAC6616beOuehdxGcnz1bRoPC1RQ2f1GQDdN+g==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-linux-s390x-gnu": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-linux-s390x-gnu/-/binding-linux-s390x-gnu-0.127.0.tgz", + "integrity": "sha512-pbXIhiNFHoqWeqDNLiJ9JkpHz1IM9k4DXa66x+1GTWMG7iLxtkXgE53iiuKSXwmk3zIYmaPVfBvgcAhS583K4Q==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-linux-x64-gnu": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-linux-x64-gnu/-/binding-linux-x64-gnu-0.127.0.tgz", + "integrity": "sha512-MYCguB9RvBvlSd6gbuNI7QwiLoCCAlGnlRJFPrzLI6U1/9wkC/WK6LtBAUln55H1Ctqw45PWmqrobKoMhsYQzQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-linux-x64-musl": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-linux-x64-musl/-/binding-linux-x64-musl-0.127.0.tgz", + "integrity": "sha512-5eY0B/bxf1xIUxb4NOTvOI3KWtBQfPWYyKAzgcrCt0mDibSZygVpO1Pz8bkeiSZ5Jj9+M09dkggG3H8I5d0Uyg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-openharmony-arm64": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-openharmony-arm64/-/binding-openharmony-arm64-0.127.0.tgz", + "integrity": "sha512-Gld0ajrFTUXNtdw20fVBuTQx66FA75nIVg+//pPfR3sXkuABB4mTBhl3r9JNzrJpgW//qiwxf0nWXUWGJSL3UQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-wasm32-wasi": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-wasm32-wasi/-/binding-wasm32-wasi-0.127.0.tgz", + "integrity": "sha512-T6KVD7rhLzFlwGRXMnxUFfkCZD8FHnb968wVXW1mXzgRFc5RNXOBY2mPPDZ77x5Ln76ltLMgtPg0cOkU1NSrEQ==", + "cpu": [ + "wasm32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "@emnapi/core": "1.9.2", + "@emnapi/runtime": "1.9.2", + "@napi-rs/wasm-runtime": "^1.1.4" + }, + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-wasm32-wasi/node_modules/@emnapi/runtime": { + "version": "1.9.2", + "resolved": "https://registry.npmjs.org/@emnapi/runtime/-/runtime-1.9.2.tgz", + "integrity": "sha512-3U4+MIWHImeyu1wnmVygh5WlgfYDtyf0k8AbLhMFxOipihf6nrWC4syIm/SwEeec0mNSafiiNnMJwbza/Is6Lw==", + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "tslib": "^2.4.0" + } + }, + "node_modules/@oxc-parser/binding-win32-arm64-msvc": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-win32-arm64-msvc/-/binding-win32-arm64-msvc-0.127.0.tgz", + "integrity": "sha512-Ujvw4X+LD1CCGULcsQcvb4YNVoBGqt+JHgNNzGGaCImELiZLk477ifUH53gIbE7EKd933NdTi25JWEr9K2HwXw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-win32-ia32-msvc": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-win32-ia32-msvc/-/binding-win32-ia32-msvc-0.127.0.tgz", + "integrity": "sha512-0cwxKO7KHQQQfo4Uf4B2SQrhgm+cJaP9OvFFhx52Tkg4bezsacu83GB2/In5bC415Ueeym+kXdnge/57rbSfTw==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-parser/binding-win32-x64-msvc": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-parser/binding-win32-x64-msvc/-/binding-win32-x64-msvc-0.127.0.tgz", + "integrity": "sha512-rOrnSQSCbhI2kowr9XxE7m9a8oQXnBHjnS6j95LxxAnEZ0+Fz20WlRXG4ondQb+ejjt2KOsa65sE6++L6kUd+w==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": "^20.19.0 || >=22.12.0" + } + }, + "node_modules/@oxc-project/types": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/@oxc-project/types/-/types-0.127.0.tgz", + "integrity": "sha512-aIYXQBo4lCbO4z0R3FHeucQHpF46l2LbMdxRvqvuRuW2OxdnSkcng5B8+K12spgLDj93rtN3+J2Vac/TIO+ciQ==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/Boshen" + } + }, + "node_modules/@oxc-resolver/binding-android-arm-eabi": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-android-arm-eabi/-/binding-android-arm-eabi-11.20.0.tgz", + "integrity": "sha512-IjfWOXRgJFNdORDl+Uf1aibNgZY2guOD3zmOhx1BGVb/MIiqlFTdmjpQNplSN58lhWehnX4UNqC3QwpUo8pjJg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@oxc-resolver/binding-android-arm64": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-android-arm64/-/binding-android-arm64-11.20.0.tgz", + "integrity": "sha512-QqslZAuFQG8Q9xm7JuIn8JUbvywhSBMVhuQHtYW+auirZJloS41oxUUaBXk7uUhZJgp44c5zQLeVvmFaDQB+2Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@oxc-resolver/binding-darwin-arm64": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-darwin-arm64/-/binding-darwin-arm64-11.20.0.tgz", + "integrity": "sha512-MUcavykj2ewlR+kc5arpg4tC2RvzJkUxWtNv74pf7lcNk00GpIpN43vXMj+j6r4eMmfZhlb8hueKoIb8e9kAGQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@oxc-resolver/binding-darwin-x64": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-darwin-x64/-/binding-darwin-x64-11.20.0.tgz", + "integrity": "sha512-BGB16nRUK5Etiv//ihPyzj8Lj1px0mhh4YIfe0FDf045ywknfSm0GEbiRESpr6Q4K82AvnyaRIhhluHByvS4bg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@oxc-resolver/binding-freebsd-x64": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-freebsd-x64/-/binding-freebsd-x64-11.20.0.tgz", + "integrity": "sha512-JZgtePaqj3qmD5XFHJaSLWzHRxQu0LaPkdoM1KJXYADvAaa83ijXHclV3ej3CueeW0wxfIAbGCZVP45J0CA7uQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@oxc-resolver/binding-linux-arm-gnueabihf": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-linux-arm-gnueabihf/-/binding-linux-arm-gnueabihf-11.20.0.tgz", + "integrity": "sha512-hOQ/p3ry3v3SchUBXicrrnszaI/UmYzM4wtS4RGfwgVUX7a+HbyQSzJ5aOzu+o6XZkFkS3ZXN4PZAzhOb77OSg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oxc-resolver/binding-linux-arm-musleabihf": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-linux-arm-musleabihf/-/binding-linux-arm-musleabihf-11.20.0.tgz", + "integrity": "sha512-2ArPksaw0AqeuGBfoS715VF+JvJQAhD2niWgjE5hVO+L+nAfikVQopvngCMX9x4BD8itWoQ3dnikrQyl5Ho5Jg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oxc-resolver/binding-linux-arm64-gnu": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-linux-arm64-gnu/-/binding-linux-arm64-gnu-11.20.0.tgz", + "integrity": "sha512-0bJnmYFp62JdZ4nVMDUZ/C58BCZOCcqgKtnUlp7L9Ojf/czIN+3j72YlLPeWLkzlr6SlYvIQA4SGV/HyO0d+qg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oxc-resolver/binding-linux-arm64-musl": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-linux-arm64-musl/-/binding-linux-arm64-musl-11.20.0.tgz", + "integrity": "sha512-wKHHzPKZo7Ufhv/Bt6yxT7FOgnIgW4gwXcJUipkShGp68W3wGVqvr1Sr0fY65lN0Oy6y41+g2kIDvkgZaMMUkw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oxc-resolver/binding-linux-ppc64-gnu": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-linux-ppc64-gnu/-/binding-linux-ppc64-gnu-11.20.0.tgz", + "integrity": "sha512-RN8goF7Ie0B79L4i4G6OeBocTgSC56vJbQ65VJje+oXnldVpLnOU7j/AQ/dP94TcCS+Yh6WG8u3Qt4ETteXFNQ==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oxc-resolver/binding-linux-riscv64-gnu": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-linux-riscv64-gnu/-/binding-linux-riscv64-gnu-11.20.0.tgz", + "integrity": "sha512-5l1yU6/xQEqLZRzxqmMxJfWPslpwCmBsdDGaBvABPehxquCXDC7dd7oraNdKSJUMDXSM7VvVj8H2D2FTjU7oWw==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oxc-resolver/binding-linux-riscv64-musl": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-linux-riscv64-musl/-/binding-linux-riscv64-musl-11.20.0.tgz", + "integrity": "sha512-xHEvkbgz6UC+A3JOyDQy76LkUaxsNSfIr3/GV8slwZsnuooJiIB34gzJfsyvR4JdCYNUUPsRJc/w/oWkODu+hg==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oxc-resolver/binding-linux-s390x-gnu": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-linux-s390x-gnu/-/binding-linux-s390x-gnu-11.20.0.tgz", + "integrity": "sha512-aWPDUUmSeyHvlW+SoEUd+JIJsQhVhu6a5tBpDRMu058naPAchTgAVGCFy35zjbnFlt0i8hLWziff6HX0D3LU4g==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oxc-resolver/binding-linux-x64-gnu": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-linux-x64-gnu/-/binding-linux-x64-gnu-11.20.0.tgz", + "integrity": "sha512-x2YeSimvhJjKLVD8KSu8f/rqU1potcdEMkApIPJqjZWN7c2Fpt4g2X32WDg1p+XDAmyT7nuQGe0vnhvXeLbH+g==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oxc-resolver/binding-linux-x64-musl": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-linux-x64-musl/-/binding-linux-x64-musl-11.20.0.tgz", + "integrity": "sha512-kcRLEIxpZefeYfLChjpgFf3ilBzRDZ+yobMrpRsQlSrxuFGtm3U6PMU7AaEpMqo3NfDGVyJJseAjnRLzMFHjwQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@oxc-resolver/binding-openharmony-arm64": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-openharmony-arm64/-/binding-openharmony-arm64-11.20.0.tgz", + "integrity": "sha512-HHcfnApSZGtKhTiHqe8OZruOZe5XuFQH5/E0Yhj3u8fnFvzkM4/k6WjacUf4SvA0SPEAbfbgYmVPuo0VX/fIBQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ] + }, + "node_modules/@oxc-resolver/binding-wasm32-wasi": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-wasm32-wasi/-/binding-wasm32-wasi-11.20.0.tgz", + "integrity": "sha512-Tn0y1XOFYHNfK1wp1Z5QK8Rcld/bsOwRISQXfqAZ5IBpv8Gz1IvV39fUWNprqNdRizgcvFhOzWwFun2zkJsyBg==", + "cpu": [ + "wasm32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "@emnapi/core": "1.10.0", + "@emnapi/runtime": "1.10.0", + "@napi-rs/wasm-runtime": "^1.1.4" + }, + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/@oxc-resolver/binding-wasm32-wasi/node_modules/@emnapi/core": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@emnapi/core/-/core-1.10.0.tgz", + "integrity": "sha512-yq6OkJ4p82CAfPl0u9mQebQHKPJkY7WrIuk205cTYnYe+k2Z8YBh11FrbRG/H6ihirqcacOgl2BIO8oyMQLeXw==", + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "@emnapi/wasi-threads": "1.2.1", + "tslib": "^2.4.0" + } + }, + "node_modules/@oxc-resolver/binding-win32-arm64-msvc": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-win32-arm64-msvc/-/binding-win32-arm64-msvc-11.20.0.tgz", + "integrity": "sha512-qPi25YNPe4YenS8MgsQU2+bIFHxxpLx1LVna2444cEHqNPhNjvWf9zqj4aWE43H9LpAsTmkkAlA3eL5ElBU3mA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@oxc-resolver/binding-win32-x64-msvc": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/@oxc-resolver/binding-win32-x64-msvc/-/binding-win32-x64-msvc-11.20.0.tgz", + "integrity": "sha512-Wb14jWEW8huH6It9F6sXd9vrYmIS7pMrgkU6sxpLxkP+9z+wRgs71hUEhRpcn8FOXAFa27FVWfY2tRpbfTzfLw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@parcel/watcher": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher/-/watcher-2.5.6.tgz", + "integrity": "sha512-tmmZ3lQxAe/k/+rNnXQRawJ4NjxO2hqiOLTHvWchtGZULp4RyFeh6aU4XdOYBFe2KE1oShQTv4AblOs2iOrNnQ==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "dependencies": { + "detect-libc": "^2.0.3", + "is-glob": "^4.0.3", + "node-addon-api": "^7.0.0", + "picomatch": "^4.0.3" + }, + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + }, + "optionalDependencies": { + "@parcel/watcher-android-arm64": "2.5.6", + "@parcel/watcher-darwin-arm64": "2.5.6", + "@parcel/watcher-darwin-x64": "2.5.6", + "@parcel/watcher-freebsd-x64": "2.5.6", + "@parcel/watcher-linux-arm-glibc": "2.5.6", + "@parcel/watcher-linux-arm-musl": "2.5.6", + "@parcel/watcher-linux-arm64-glibc": "2.5.6", + "@parcel/watcher-linux-arm64-musl": "2.5.6", + "@parcel/watcher-linux-x64-glibc": "2.5.6", + "@parcel/watcher-linux-x64-musl": "2.5.6", + "@parcel/watcher-win32-arm64": "2.5.6", + "@parcel/watcher-win32-ia32": "2.5.6", + "@parcel/watcher-win32-x64": "2.5.6" + } + }, + "node_modules/@parcel/watcher-android-arm64": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-android-arm64/-/watcher-android-arm64-2.5.6.tgz", + "integrity": "sha512-YQxSS34tPF/6ZG7r/Ih9xy+kP/WwediEUsqmtf0cuCV5TPPKw/PQHRhueUo6JdeFJaqV3pyjm0GdYjZotbRt/A==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-darwin-arm64": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-darwin-arm64/-/watcher-darwin-arm64-2.5.6.tgz", + "integrity": "sha512-Z2ZdrnwyXvvvdtRHLmM4knydIdU9adO3D4n/0cVipF3rRiwP+3/sfzpAwA/qKFL6i1ModaabkU7IbpeMBgiVEA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-darwin-x64": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-darwin-x64/-/watcher-darwin-x64-2.5.6.tgz", + "integrity": "sha512-HgvOf3W9dhithcwOWX9uDZyn1lW9R+7tPZ4sug+NGrGIo4Rk1hAXLEbcH1TQSqxts0NYXXlOWqVpvS1SFS4fRg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-freebsd-x64": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-freebsd-x64/-/watcher-freebsd-x64-2.5.6.tgz", + "integrity": "sha512-vJVi8yd/qzJxEKHkeemh7w3YAn6RJCtYlE4HPMoVnCpIXEzSrxErBW5SJBgKLbXU3WdIpkjBTeUNtyBVn8TRng==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-arm-glibc": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm-glibc/-/watcher-linux-arm-glibc-2.5.6.tgz", + "integrity": "sha512-9JiYfB6h6BgV50CCfasfLf/uvOcJskMSwcdH1PHH9rvS1IrNy8zad6IUVPVUfmXr+u+Km9IxcfMLzgdOudz9EQ==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-arm-musl": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm-musl/-/watcher-linux-arm-musl-2.5.6.tgz", + "integrity": "sha512-Ve3gUCG57nuUUSyjBq/MAM0CzArtuIOxsBdQ+ftz6ho8n7s1i9E1Nmk/xmP323r2YL0SONs1EuwqBp2u1k5fxg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-arm64-glibc": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm64-glibc/-/watcher-linux-arm64-glibc-2.5.6.tgz", + "integrity": "sha512-f2g/DT3NhGPdBmMWYoxixqYr3v/UXcmLOYy16Bx0TM20Tchduwr4EaCbmxh1321TABqPGDpS8D/ggOTaljijOA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-arm64-musl": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-arm64-musl/-/watcher-linux-arm64-musl-2.5.6.tgz", + "integrity": "sha512-qb6naMDGlbCwdhLj6hgoVKJl2odL34z2sqkC7Z6kzir8b5W65WYDpLB6R06KabvZdgoHI/zxke4b3zR0wAbDTA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-x64-glibc": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-x64-glibc/-/watcher-linux-x64-glibc-2.5.6.tgz", + "integrity": "sha512-kbT5wvNQlx7NaGjzPFu8nVIW1rWqV780O7ZtkjuWaPUgpv2NMFpjYERVi0UYj1msZNyCzGlaCWEtzc+exjMGbQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-linux-x64-musl": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-linux-x64-musl/-/watcher-linux-x64-musl-2.5.6.tgz", + "integrity": "sha512-1JRFeC+h7RdXwldHzTsmdtYR/Ku8SylLgTU/reMuqdVD7CtLwf0VR1FqeprZ0eHQkO0vqsbvFLXUmYm/uNKJBg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-win32-arm64": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-win32-arm64/-/watcher-win32-arm64-2.5.6.tgz", + "integrity": "sha512-3ukyebjc6eGlw9yRt678DxVF7rjXatWiHvTXqphZLvo7aC5NdEgFufVwjFfY51ijYEWpXbqF5jtrK275z52D4Q==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-win32-ia32": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-win32-ia32/-/watcher-win32-ia32-2.5.6.tgz", + "integrity": "sha512-k35yLp1ZMwwee3Ez/pxBi5cf4AoBKYXj00CZ80jUz5h8prpiaQsiRPKQMxoLstNuqe2vR4RNPEAEcjEFzhEz/g==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@parcel/watcher-win32-x64": { + "version": "2.5.6", + "resolved": "https://registry.npmjs.org/@parcel/watcher-win32-x64/-/watcher-win32-x64-2.5.6.tgz", + "integrity": "sha512-hbQlYcCq5dlAX9Qx+kFb0FHue6vbjlf0FrNzSKdYK2APUf7tGfGxQCk2ihEREmbR6ZMc0MVAD5RIX/41gpUzTw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/parcel" + } + }, + "node_modules/@playwright/test": { + "version": "1.56.1", + "resolved": "https://registry.npmjs.org/@playwright/test/-/test-1.56.1.tgz", + "integrity": "sha512-vSMYtL/zOcFpvJCW71Q/OEGQb7KYBPAdKh35WNSkaZA75JlAO8ED8UN6GUNTm3drWomcbcqRPFqQbLae8yBTdg==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "playwright": "1.56.1" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@polka/url": { + "version": "1.0.0-next.29", + "resolved": "https://registry.npmjs.org/@polka/url/-/url-1.0.0-next.29.tgz", + "integrity": "sha512-wwQAWhWSuHaag8c4q/KN/vCoeOJYshAIvMQwD4GpSb3OiZklFfvAgmj0VCBBImRpuF/aFgIRzllXlVX93Jevww==", + "dev": true, + "license": "MIT" + }, + "node_modules/@quansync/fs": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@quansync/fs/-/fs-1.0.0.tgz", + "integrity": "sha512-4TJ3DFtlf1L5LDMaM6CanJ/0lckGNtJcMjQ1NAV6zDmA0tEHKZtxNKin8EgPaVX1YzljbxckyT2tJrpQKAtngQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "quansync": "^1.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sxzz" + } + }, + "node_modules/@rollup/plugin-babel": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/@rollup/plugin-babel/-/plugin-babel-6.1.0.tgz", + "integrity": "sha512-dFZNuFD2YRcoomP4oYf+DvQNSUA9ih+A3vUqopQx5EdtPGo3WBnQcI/S8pwpz91UsGfL0HsMSOlaMld8HrbubA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@babel/helper-module-imports": "^7.18.6", + "@rollup/pluginutils": "^5.0.1" + }, + "engines": { + "node": ">=14.0.0" + }, + "peerDependencies": { + "@babel/core": "^7.0.0", + "@types/babel__core": "^7.1.9", + "rollup": "^1.20.0||^2.0.0||^3.0.0||^4.0.0" + }, + "peerDependenciesMeta": { + "@types/babel__core": { + "optional": true + }, + "rollup": { + "optional": true + } + } + }, + "node_modules/@rollup/plugin-node-resolve": { + "version": "16.0.3", + "resolved": "https://registry.npmjs.org/@rollup/plugin-node-resolve/-/plugin-node-resolve-16.0.3.tgz", + "integrity": "sha512-lUYM3UBGuM93CnMPG1YocWu7X802BrNF3jW2zny5gQyLQgRFJhV1Sq0Zi74+dh/6NBx1DxFC4b4GXg9wUCG5Qg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@rollup/pluginutils": "^5.0.1", + "@types/resolve": "1.20.2", + "deepmerge": "^4.2.2", + "is-module": "^1.0.0", + "resolve": "^1.22.1" + }, + "engines": { + "node": ">=14.0.0" + }, + "peerDependencies": { + "rollup": "^2.78.0||^3.0.0||^4.0.0" + }, + "peerDependenciesMeta": { + "rollup": { + "optional": true + } + } + }, + "node_modules/@rollup/plugin-replace": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/@rollup/plugin-replace/-/plugin-replace-6.0.3.tgz", + "integrity": "sha512-J4RZarRvQAm5IF0/LwUUg+obsm+xZhYnbMXmXROyoSE1ATJe3oXSb9L5MMppdxP2ylNSjv6zFBwKYjcKMucVfA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@rollup/pluginutils": "^5.0.1", + "magic-string": "^0.30.3" + }, + "engines": { + "node": ">=14.0.0" + }, + "peerDependencies": { + "rollup": "^1.20.0||^2.0.0||^3.0.0||^4.0.0" + }, + "peerDependenciesMeta": { + "rollup": { + "optional": true + } + } + }, + "node_modules/@rollup/plugin-terser": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@rollup/plugin-terser/-/plugin-terser-1.0.0.tgz", + "integrity": "sha512-FnCxhTBx6bMOYQrar6C8h3scPt8/JwIzw3+AJ2K++6guogH5fYaIFia+zZuhqv0eo1RN7W1Pz630SyvLbDjhtQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "serialize-javascript": "^7.0.3", + "smob": "^1.0.0", + "terser": "^5.17.4" + }, + "engines": { + "node": ">=20.0.0" + }, + "peerDependencies": { + "rollup": "^2.0.0||^3.0.0||^4.0.0" + }, + "peerDependenciesMeta": { + "rollup": { + "optional": true + } + } + }, + "node_modules/@rollup/pluginutils": { + "version": "5.4.0", + "resolved": "https://registry.npmjs.org/@rollup/pluginutils/-/pluginutils-5.4.0.tgz", + "integrity": "sha512-MfPp06CjRLfXQ3wY0R8vJDYBy/MvVcc9OulEfR0B8Iv9ko+GCNaRZ+EpJYFl27LhKsZK0o420sYCRHCjfCgeUg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/estree": "^1.0.0", + "estree-walker": "^2.0.2", + "picomatch": "^4.0.2" + }, + "engines": { + "node": ">=14.0.0" + }, + "peerDependencies": { + "rollup": "^1.20.0||^2.0.0||^3.0.0||^4.0.0" + }, + "peerDependenciesMeta": { + "rollup": { + "optional": true + } + } + }, + "node_modules/@rollup/pluginutils/node_modules/estree-walker": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-2.0.2.tgz", + "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==", + "dev": true, + "license": "MIT" + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.61.1.tgz", + "integrity": "sha512-JnBB8MdXj45cajvTuO5FmPlvFVJRQgvrz1uSEl3NwqFnReAPGwb8EanbGi4z2nRaqLzjJSv5/JmycoTKlRZxHA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.61.1.tgz", + "integrity": "sha512-Jx2g7iSjw4AOT0HDPHM9RV3GNjRXwybWtSFZiZAYUTjUwjVrYIwq3kBf+LnhqJlzXFAqTAh2F7IGI+O568exPw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.61.1.tgz", + "integrity": "sha512-0F1L/Z3Eqv8mT2n3dCpeO8GcTvHvVqkP5/t6DMsn0KzhYVcg+s7Ncl5DS8qjKYEeio6Az0Gt6nyBORay5qIlCA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.61.1.tgz", + "integrity": "sha512-qLttcH871ujY4YcVfUSShhOw+CsoTatYz8gRbHO7Bb92QH059/P0y5do1KMs41fY0BpD2x4AJH/gID0zFiqVKQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-freebsd-arm64": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-arm64/-/rollup-freebsd-arm64-4.61.1.tgz", + "integrity": "sha512-fUI4RapGE0Oh3mb8mgfvC1O2nU1RpDZUKnDQm3xB1Ipg7C2wTs5Kstz7G2uWK99a8S2yTMq8/P4uycwNa0nJyw==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-freebsd-x64": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-freebsd-x64/-/rollup-freebsd-x64-4.61.1.tgz", + "integrity": "sha512-H5YrdvJaDtI/U9/emrD4b++xkvp3y/JvOe4rizHbxvkyMfRS/CiRYdji+Pl8D0brEaNFWUh1drQxgAGIl6Xudw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.61.1.tgz", + "integrity": "sha512-Q8CBCCQtDFrYtXoeUXSrnFXKOnyUhx6bz+SkL6A0E7V8kAiCJ5pamq1WtbfpVGhR5TSpXY6ak3avmDc5fHTyJA==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.61.1.tgz", + "integrity": "sha512-nwnhk1581l0FBVellGcVCAT0Oi06onEA3WB53sf01VO3I0UPBkMH9sXONYME2K0ovXcNayJfNtHfm6mpJElatQ==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.61.1.tgz", + "integrity": "sha512-x5Xr49hwt3hdW75UOZm3395YwwzPyauktslv29KpWL/T+vVAzoT3azLcTWv0eMciBNrx+DYjH4paehHoLpPvpg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.61.1.tgz", + "integrity": "sha512-unMS3H73DpaoPyyEVPjGKleM/s0mkmsauTENpw4INQY8y4+IuLNjkueQ5QCtC0D3N38Y38yhAU8OoZ20S2Tm6w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loong64-gnu": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-gnu/-/rollup-linux-loong64-gnu-4.61.1.tgz", + "integrity": "sha512-zNZzGRnAhwjFEYmvphJRV5XaQGjs62cCmeYYHUT//NbvEnHauw+I85nGG+SiVg5ld4GX8D1IbKIX+ozITQnhMQ==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-loong64-musl": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-loong64-musl/-/rollup-linux-loong64-musl-4.61.1.tgz", + "integrity": "sha512-LdpWGL8X209B2SIvWjqlc8VZgM6PKfontSerGepuldQmHYrAOtnMCXeJkxXGbC+PPZVOuu5czJo7fNV6aeW8rQ==", + "cpu": [ + "loong64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-gnu": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-gnu/-/rollup-linux-ppc64-gnu-4.61.1.tgz", + "integrity": "sha512-EC5kTtNaNGOmbMGqar8dvJy6y/hg99GAwjfBz++pxZhQATXGcRjd6c5en5wcbru0vkRmiMGsQKdMJOOf6sza4g==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-ppc64-musl": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-ppc64-musl/-/rollup-linux-ppc64-musl-4.61.1.tgz", + "integrity": "sha512-8hiwp6D4acEcNK78I4rP0/XtS1sknWIAMJBPdR4l6zUtyTm5KiTDr5bXmWt4foY7nAN7AThDHgkLIEZOWKbzWw==", + "cpu": [ + "ppc64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.61.1.tgz", + "integrity": "sha512-10dh/h/BqA7DuMPWSxkR8uks18FRwnwOEqr5zOTEl+NOwP/OMzKX8OFR/Of9xxDA7D5qef1Nzar5WDD2kCCr1g==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-musl": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-musl/-/rollup-linux-riscv64-musl-4.61.1.tgz", + "integrity": "sha512-YKJ5lg35DP17gcAOggnihe+APw9HLyj1Xn7gsmGumBJAUDa6NGXNixJzmkWLhcK9TOuuyQjdamzvJefkO7qHZQ==", + "cpu": [ + "riscv64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.61.1.tgz", + "integrity": "sha512-Mlil5G2Jj6a7B3LWGctg+XPL9vdXYuzCtNXfxOQ0nPjc2m6ueUktocPGH9bnAM0bNRKb/bAWTujUU7IJQdQA+g==", + "cpu": [ + "s390x" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.61.1.tgz", + "integrity": "sha512-bVWIOIk6pV01p4CdUbPP7CJ/434z+OooYjDuFcR+44N35YvKUC66G8MGnvcWx5mWKW3g61J+t74l3Kj15Kwn2Q==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.61.1.tgz", + "integrity": "sha512-qy5pBvZbqNFheBz61R1rzsezjm0J7O2oNGoWtGoY89SZYLUfxAJTBAqDChqAIdB4rCiIbi9nF7yZ83GnNiLwSw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-openbsd-x64": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openbsd-x64/-/rollup-openbsd-x64-4.61.1.tgz", + "integrity": "sha512-E83TXjI4zm0+5f2qO+UOudaCYIhYwpJ5jq6YCZNIZ+6CbfhKrkAGezeiASBL9ElxAxFsRS9ZhESv8mfnj6TKeg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openbsd" + ] + }, + "node_modules/@rollup/rollup-openharmony-arm64": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-openharmony-arm64/-/rollup-openharmony-arm64-4.61.1.tgz", + "integrity": "sha512-fbWnKqVkjrJN38vNe3ahkbk6iejS/3b0Nt7EEtPpE6RBacZcGXNKbzfHN3GUUlXOPghUg0j6XUGrtjX9z1sIvA==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "openharmony" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.61.1.tgz", + "integrity": "sha512-ArMl38iVAbk0New1ogihQNY6iphLi4ZaRsa037gUzv5yeKPY8TD3Dmy4x2RNC1VztU/uqm+G+/RwFrSka3Oy2g==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.61.1.tgz", + "integrity": "sha512-0mYtjHS9ucAbcATycCNK9IGBk/cCe/ma7EmSLGZdsxnOA8cjRIyU04wDpVAD9NiOfLUR9KTxdiO53uOkherqjQ==", + "cpu": [ + "ia32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-gnu": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-gnu/-/rollup-win32-x64-gnu-4.61.1.tgz", + "integrity": "sha512-gK1iCEPfpoSG9wfBihXxvBMi8ZfcWffYkEsC/Eih+iFENTaewvNcrEQ69lIOWYO5pePHKLHHO7nq5AILGO/HQQ==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.61.1.tgz", + "integrity": "sha512-X+zaP2x+j4RXGfbp/seSoRHWnPxzApilDszisZxbYH5C/jTxFhCtDNdPGZb9lJyYPs24wGxruPF7Y+sIXt9Gzw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@standard-schema/spec": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@standard-schema/spec/-/spec-1.1.0.tgz", + "integrity": "sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w==", + "dev": true, + "license": "MIT" + }, + "node_modules/@storybook/addon-a11y": { + "version": "10.2.4", + "resolved": "https://registry.npmjs.org/@storybook/addon-a11y/-/addon-a11y-10.2.4.tgz", + "integrity": "sha512-VGhdZ+iP2l/CSulIKV2kt3SMWVHntOigqWqGkNYf6YNYofynUYEKdsNqBvHx4ySuNEl/eXJ8LRO8FKYnU7LxZQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@storybook/global": "^5.0.0", + "axe-core": "^4.2.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/storybook" + }, + "peerDependencies": { + "storybook": "^10.2.4" + } + }, + "node_modules/@storybook/addon-docs": { + "version": "10.2.4", + "resolved": "https://registry.npmjs.org/@storybook/addon-docs/-/addon-docs-10.2.4.tgz", + "integrity": "sha512-FzscAmdBiOGnGrxiEM+8eTg43kjqgjLfObg+lbJVRR/a0DmZ3xfAPNB0+VKYQbN0FacNcWLM9LZ/7U0hRBPBnQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@mdx-js/react": "^3.0.0", + "@storybook/csf-plugin": "10.2.4", + "@storybook/icons": "^2.0.1", + "@storybook/react-dom-shim": "10.2.4", + "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "react-dom": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "ts-dedent": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/storybook" + }, + "peerDependencies": { + "storybook": "^10.2.4" + } + }, + "node_modules/@storybook/addon-svelte-csf": { + "version": "5.0.10", + "resolved": "https://registry.npmjs.org/@storybook/addon-svelte-csf/-/addon-svelte-csf-5.0.10.tgz", + "integrity": "sha512-poSvTS7VdaQ42ZoqW5e4+2Hv1iLO0mekH9fwn/QuBNse48R4WlTyR8XFbHRTfatl9gdc9ZYC4uWzazrmV6zGIA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@storybook/csf": "^0.1.13", + "dedent": "^1.5.3", + "es-toolkit": "^1.26.1", + "esrap": "^1.2.2", + "magic-string": "^0.30.12", + "svelte-ast-print": "^0.4.0", + "zimmerframe": "^1.1.2" + }, + "peerDependencies": { + "@storybook/svelte": "^0.0.0-0 || ^8.2.0 || ^9.0.0 || ^9.1.0-0 || ^10.0.0-0", + "@sveltejs/vite-plugin-svelte": "^4.0.0 || ^5.0.0 || ^6.0.0", + "storybook": "^0.0.0-0 || ^8.2.0 || ^9.0.0 || ^9.1.0-0 || ^10.0.0-0", + "svelte": "^5.0.0", + "vite": "^5.0.0 || ^6.0.0 || ^7.0.0" + } + }, + "node_modules/@storybook/addon-vitest": { + "version": "10.2.4", + "resolved": "https://registry.npmjs.org/@storybook/addon-vitest/-/addon-vitest-10.2.4.tgz", + "integrity": "sha512-BT1iP89U4wcbpzTURU8WYTAeUcdNh4WIt0BqsnATmMwR/jKNJW6QgXCVqGQTSpRjWj40hX5e2JkQYCNXdjKsPw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@storybook/global": "^5.0.0", + "@storybook/icons": "^2.0.1" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/storybook" + }, + "peerDependencies": { + "@vitest/browser": "^3.0.0 || ^4.0.0", + "@vitest/browser-playwright": "^4.0.0", + "@vitest/runner": "^3.0.0 || ^4.0.0", + "storybook": "^10.2.4", + "vitest": "^3.0.0 || ^4.0.0" + }, + "peerDependenciesMeta": { + "@vitest/browser": { + "optional": true + }, + "@vitest/browser-playwright": { + "optional": true + }, + "@vitest/runner": { + "optional": true + }, + "vitest": { + "optional": true + } + } + }, + "node_modules/@storybook/builder-vite": { + "version": "10.2.4", + "resolved": "https://registry.npmjs.org/@storybook/builder-vite/-/builder-vite-10.2.4.tgz", + "integrity": "sha512-/hcT1xj3CL5GkJ5v5/EguZdttDwNE6weNXK7vKzp034tnGcLycOossDsTiUQkBowSL+Ylc8aKj+ZgvddPNfOig==", + "dev": true, + "license": "MIT", + "dependencies": { + "@storybook/csf-plugin": "10.2.4", + "ts-dedent": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/storybook" + }, + "peerDependencies": { + "storybook": "^10.2.4", + "vite": "^5.0.0 || ^6.0.0 || ^7.0.0" + } + }, + "node_modules/@storybook/csf": { + "version": "0.1.13", + "resolved": "https://registry.npmjs.org/@storybook/csf/-/csf-0.1.13.tgz", + "integrity": "sha512-7xOOwCLGB3ebM87eemep89MYRFTko+D8qE7EdAAq74lgdqRR5cOUtYWJLjO2dLtP94nqoOdHJo6MdLLKzg412Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "type-fest": "^2.19.0" + } + }, + "node_modules/@storybook/csf-plugin": { + "version": "10.2.4", + "resolved": "https://registry.npmjs.org/@storybook/csf-plugin/-/csf-plugin-10.2.4.tgz", + "integrity": "sha512-kupPQEV+4N9mzsZHYaokvhO/KHBjYdWda9PNmPQwy0TR7r2mzthgaNH72TjmgN1L6DIbsuyOG1wtczcPJn4+Jg==", + "dev": true, + "license": "MIT", + "dependencies": { + "unplugin": "^2.3.5" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/storybook" + }, + "peerDependencies": { + "esbuild": "*", + "rollup": "*", + "storybook": "^10.2.4", + "vite": "*", + "webpack": "*" + }, + "peerDependenciesMeta": { + "esbuild": { + "optional": true + }, + "rollup": { + "optional": true + }, + "vite": { + "optional": true + }, + "webpack": { + "optional": true + } + } + }, + "node_modules/@storybook/global": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/@storybook/global/-/global-5.0.0.tgz", + "integrity": "sha512-FcOqPAXACP0I3oJ/ws6/rrPT9WGhu915Cg8D02a9YxLo0DE9zI+a9A5gRGvmQ09fiWPukqI8ZAEoQEdWUKMQdQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/@storybook/icons": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/@storybook/icons/-/icons-2.0.2.tgz", + "integrity": "sha512-KZBCpXsshAIjczYNXR/rlxEtCUX/eAbpFNwKi8bcOomrLA4t/SyPz5RF+lVPO2oZBUE4sAkt43mfJUevQDSEEw==", + "dev": true, + "license": "MIT", + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "react-dom": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" + } + }, + "node_modules/@storybook/react-dom-shim": { + "version": "10.2.4", + "resolved": "https://registry.npmjs.org/@storybook/react-dom-shim/-/react-dom-shim-10.2.4.tgz", + "integrity": "sha512-i22OtrZ7GeZPt/odLf0vqyDhRSKyaLsHkkKSBcANQfzRRnBZmiz2FchOtWm9uvoDWybQsTruZq7kTdtpEhwyGw==", + "dev": true, + "license": "MIT", + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/storybook" + }, + "peerDependencies": { + "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "react-dom": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "storybook": "^10.2.4" + } + }, + "node_modules/@storybook/svelte": { + "version": "10.2.4", + "resolved": "https://registry.npmjs.org/@storybook/svelte/-/svelte-10.2.4.tgz", + "integrity": "sha512-W9R51zUCd2iHOQBg/D93+bdpYv6kbtFx+kft5X8lPKQl6yEu0aKs9i5N5GyCASOhIApgx/tkqZIJ7vgM4cqrHA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ts-dedent": "^2.0.0", + "type-fest": "~2.19" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/storybook" + }, + "peerDependencies": { + "storybook": "^10.2.4", + "svelte": "^5.0.0" + } + }, + "node_modules/@storybook/svelte-vite": { + "version": "10.2.4", + "resolved": "https://registry.npmjs.org/@storybook/svelte-vite/-/svelte-vite-10.2.4.tgz", + "integrity": "sha512-FMgKMRdoZFDwPD6eIDMldcgp6d6NtIGuXyUJjb29qLias/gE5TI6hg+cWmmWXQRTrXwdyepeMBmIfRcZbB6REQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@storybook/builder-vite": "10.2.4", + "@storybook/svelte": "10.2.4", + "magic-string": "^0.30.0", + "svelte2tsx": "^0.7.44", + "typescript": "^4.9.4 || ^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/storybook" + }, + "peerDependencies": { + "@sveltejs/vite-plugin-svelte": "^2.0.0 || ^3.0.0 || ^4.0.0 || ^5.0.0 || ^6.0.0", + "storybook": "^10.2.4", + "svelte": "^5.0.0", + "vite": "^5.0.0 || ^6.0.0 || ^7.0.0" + } + }, + "node_modules/@storybook/sveltekit": { + "version": "10.2.4", + "resolved": "https://registry.npmjs.org/@storybook/sveltekit/-/sveltekit-10.2.4.tgz", + "integrity": "sha512-1qDX35iSJHWo1AOd7HMzJtCHBfgahXqTWNiyZa/JMEKJ3qC1otaU8XMmTjsZ6fCRF99piNdgqtWM8+s1TJOldg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@storybook/builder-vite": "10.2.4", + "@storybook/svelte": "10.2.4", + "@storybook/svelte-vite": "10.2.4" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/storybook" + }, + "peerDependencies": { + "storybook": "^10.2.4", + "svelte": "^5.0.0", + "vite": "^5.0.0 || ^6.0.0 || ^7.0.0" + } + }, + "node_modules/@sveltejs/acorn-typescript": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/@sveltejs/acorn-typescript/-/acorn-typescript-1.0.10.tgz", + "integrity": "sha512-4WfKk68eTih+MiJD4fSbxN7E8kVBmTMPWHUPYjvl2N0rMs53YLTT8/YjKU5Dtnz5LqDjl7LEw4U7lXR2W3J5WA==", + "dev": true, + "license": "MIT", + "peerDependencies": { + "acorn": "^8.9.0" + } + }, + "node_modules/@sveltejs/adapter-static": { + "version": "3.0.10", + "resolved": "https://registry.npmjs.org/@sveltejs/adapter-static/-/adapter-static-3.0.10.tgz", + "integrity": "sha512-7D9lYFWJmB7zxZyTE/qxjksvMqzMuYrrsyh1f4AlZqeZeACPRySjbC3aFiY55wb1tWUaKOQG9PVbm74JcN2Iew==", + "dev": true, + "license": "MIT", + "peerDependencies": { + "@sveltejs/kit": "^2.0.0" + } + }, + "node_modules/@sveltejs/kit": { + "version": "2.60.1", + "resolved": "https://registry.npmjs.org/@sveltejs/kit/-/kit-2.60.1.tgz", + "integrity": "sha512-mQjlkNo+rJvpln7V2IGY2j99BqhcFbS4UN0AQNKNYfhBAFZTuCDAdW3a1sgf330mvtNvsBXn3HpAhcmvdJTcIQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@standard-schema/spec": "^1.0.0", + "@sveltejs/acorn-typescript": "^1.0.5", + "@types/cookie": "^0.6.0", + "acorn": "^8.14.1", + "cookie": "^0.6.0", + "devalue": "^5.8.1", + "esm-env": "^1.2.2", + "kleur": "^4.1.5", + "magic-string": "^0.30.5", + "mrmime": "^2.0.0", + "set-cookie-parser": "^3.0.0", + "sirv": "^3.0.0" + }, + "bin": { + "svelte-kit": "svelte-kit.js" + }, + "engines": { + "node": ">=18.13" + }, + "peerDependencies": { + "@opentelemetry/api": "^1.0.0", + "@sveltejs/vite-plugin-svelte": "^3.0.0 || ^4.0.0-next.1 || ^5.0.0 || ^6.0.0-next.0 || ^7.0.0", + "svelte": "^4.0.0 || ^5.0.0-next.0", + "typescript": "^5.3.3 || ^6.0.0", + "vite": "^5.0.3 || ^6.0.0 || ^7.0.0-beta.0 || ^8.0.0" + }, + "peerDependenciesMeta": { + "@opentelemetry/api": { + "optional": true + }, + "typescript": { + "optional": true + } + } + }, + "node_modules/@sveltejs/load-config": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/@sveltejs/load-config/-/load-config-0.1.1.tgz", + "integrity": "sha512-BXXm+VOH/9X4N7Dd1iZ2MqA1h7M+9i2noI8QYuLDY8QcN2WHYn7D/VK/+IJNfcAmRw7ACNJ538UT9GXIhnBTiA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 18.0.0" + } + }, + "node_modules/@sveltejs/vite-plugin-svelte": { + "version": "6.2.1", + "resolved": "https://registry.npmjs.org/@sveltejs/vite-plugin-svelte/-/vite-plugin-svelte-6.2.1.tgz", + "integrity": "sha512-YZs/OSKOQAQCnJvM/P+F1URotNnYNeU3P2s4oIpzm1uFaqUEqRxUB0g5ejMjEb5Gjb9/PiBI5Ktrq4rUUF8UVQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@sveltejs/vite-plugin-svelte-inspector": "^5.0.0", + "debug": "^4.4.1", + "deepmerge": "^4.3.1", + "magic-string": "^0.30.17", + "vitefu": "^1.1.1" + }, + "engines": { + "node": "^20.19 || ^22.12 || >=24" + }, + "peerDependencies": { + "svelte": "^5.0.0", + "vite": "^6.3.0 || ^7.0.0" + } + }, + "node_modules/@sveltejs/vite-plugin-svelte-inspector": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/@sveltejs/vite-plugin-svelte-inspector/-/vite-plugin-svelte-inspector-5.0.2.tgz", + "integrity": "sha512-TZzRTcEtZffICSAoZGkPSl6Etsj2torOVrx6Uw0KpXxrec9Gg6jFWQ60Q3+LmNGfZSxHRCZL7vXVZIWmuV50Ig==", + "dev": true, + "license": "MIT", + "dependencies": { + "obug": "^2.1.0" + }, + "engines": { + "node": "^20.19 || ^22.12 || >=24" + }, + "peerDependencies": { + "@sveltejs/vite-plugin-svelte": "^6.0.0-next.0", + "svelte": "^5.0.0", + "vite": "^6.3.0 || ^7.0.0" + } + }, + "node_modules/@swc/helpers": { + "version": "0.5.23", + "resolved": "https://registry.npmjs.org/@swc/helpers/-/helpers-0.5.23.tgz", + "integrity": "sha512-5lSsMOTXURePglDfvuAQUqkGek9Hg2kksOYay2m0+XR++b2NWYL/4sWyuvVBIs8oKnJaxkdi9whaL/sqN13afw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "tslib": "^2.8.0" + } + }, + "node_modules/@tailwindcss/forms": { + "version": "0.5.10", + "resolved": "https://registry.npmjs.org/@tailwindcss/forms/-/forms-0.5.10.tgz", + "integrity": "sha512-utI1ONF6uf/pPNO68kmN1b8rEwNXv3czukalo8VtJH8ksIkZXr3Q3VYudZLkCsDd4Wku120uF02hYK25XGPorw==", + "dev": true, + "license": "MIT", + "dependencies": { + "mini-svg-data-uri": "^1.2.3" + }, + "peerDependencies": { + "tailwindcss": ">=3.0.0 || >= 3.0.0-alpha.1 || >= 4.0.0-alpha.20 || >= 4.0.0-beta.1" + } + }, + "node_modules/@tailwindcss/node": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/node/-/node-4.1.11.tgz", + "integrity": "sha512-yzhzuGRmv5QyU9qLNg4GTlYI6STedBWRE7NjxP45CsFYYq9taI0zJXZBMqIC/c8fViNLhmrbpSFS57EoxUmD6Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "@ampproject/remapping": "^2.3.0", + "enhanced-resolve": "^5.18.1", + "jiti": "^2.4.2", + "lightningcss": "1.30.1", + "magic-string": "^0.30.17", + "source-map-js": "^1.2.1", + "tailwindcss": "4.1.11" + } + }, + "node_modules/@tailwindcss/node/node_modules/tailwindcss": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.11.tgz", + "integrity": "sha512-2E9TBm6MDD/xKYe+dvJZAmg3yxIEDNRc0jwlNyDg/4Fil2QcSLjFKGVff0lAf1jjeaArlG/M75Ey/EYr/OJtBA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@tailwindcss/oxide": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide/-/oxide-4.1.11.tgz", + "integrity": "sha512-Q69XzrtAhuyfHo+5/HMgr1lAiPP/G40OMFAnws7xcFEYqcypZmdW8eGXaOUIeOl1dzPJBPENXgbjsOyhg2nkrg==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "dependencies": { + "detect-libc": "^2.0.4", + "tar": "^7.4.3" + }, + "engines": { + "node": ">= 10" + }, + "optionalDependencies": { + "@tailwindcss/oxide-android-arm64": "4.1.11", + "@tailwindcss/oxide-darwin-arm64": "4.1.11", + "@tailwindcss/oxide-darwin-x64": "4.1.11", + "@tailwindcss/oxide-freebsd-x64": "4.1.11", + "@tailwindcss/oxide-linux-arm-gnueabihf": "4.1.11", + "@tailwindcss/oxide-linux-arm64-gnu": "4.1.11", + "@tailwindcss/oxide-linux-arm64-musl": "4.1.11", + "@tailwindcss/oxide-linux-x64-gnu": "4.1.11", + "@tailwindcss/oxide-linux-x64-musl": "4.1.11", + "@tailwindcss/oxide-wasm32-wasi": "4.1.11", + "@tailwindcss/oxide-win32-arm64-msvc": "4.1.11", + "@tailwindcss/oxide-win32-x64-msvc": "4.1.11" + } + }, + "node_modules/@tailwindcss/oxide-android-arm64": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-android-arm64/-/oxide-android-arm64-4.1.11.tgz", + "integrity": "sha512-3IfFuATVRUMZZprEIx9OGDjG3Ou3jG4xQzNTvjDoKmU9JdmoCohQJ83MYd0GPnQIu89YoJqvMM0G3uqLRFtetg==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@tailwindcss/oxide-darwin-arm64": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-darwin-arm64/-/oxide-darwin-arm64-4.1.11.tgz", + "integrity": "sha512-ESgStEOEsyg8J5YcMb1xl8WFOXfeBmrhAwGsFxxB2CxY9evy63+AtpbDLAyRkJnxLy2WsD1qF13E97uQyP1lfQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@tailwindcss/oxide-darwin-x64": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-darwin-x64/-/oxide-darwin-x64-4.1.11.tgz", + "integrity": "sha512-EgnK8kRchgmgzG6jE10UQNaH9Mwi2n+yw1jWmof9Vyg2lpKNX2ioe7CJdf9M5f8V9uaQxInenZkOxnTVL3fhAw==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@tailwindcss/oxide-freebsd-x64": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-freebsd-x64/-/oxide-freebsd-x64-4.1.11.tgz", + "integrity": "sha512-xdqKtbpHs7pQhIKmqVpxStnY1skuNh4CtbcyOHeX1YBE0hArj2romsFGb6yUmzkq/6M24nkxDqU8GYrKrz+UcA==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@tailwindcss/oxide-linux-arm-gnueabihf": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm-gnueabihf/-/oxide-linux-arm-gnueabihf-4.1.11.tgz", + "integrity": "sha512-ryHQK2eyDYYMwB5wZL46uoxz2zzDZsFBwfjssgB7pzytAeCCa6glsiJGjhTEddq/4OsIjsLNMAiMlHNYnkEEeg==", + "cpu": [ + "arm" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@tailwindcss/oxide-linux-arm64-gnu": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm64-gnu/-/oxide-linux-arm64-gnu-4.1.11.tgz", + "integrity": "sha512-mYwqheq4BXF83j/w75ewkPJmPZIqqP1nhoghS9D57CLjsh3Nfq0m4ftTotRYtGnZd3eCztgbSPJ9QhfC91gDZQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@tailwindcss/oxide-linux-arm64-musl": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-arm64-musl/-/oxide-linux-arm64-musl-4.1.11.tgz", + "integrity": "sha512-m/NVRFNGlEHJrNVk3O6I9ggVuNjXHIPoD6bqay/pubtYC9QIdAMpS+cswZQPBLvVvEF6GtSNONbDkZrjWZXYNQ==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@tailwindcss/oxide-linux-x64-gnu": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-x64-gnu/-/oxide-linux-x64-gnu-4.1.11.tgz", + "integrity": "sha512-YW6sblI7xukSD2TdbbaeQVDysIm/UPJtObHJHKxDEcW2exAtY47j52f8jZXkqE1krdnkhCMGqP3dbniu1Te2Fg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@tailwindcss/oxide-linux-x64-musl": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-linux-x64-musl/-/oxide-linux-x64-musl-4.1.11.tgz", + "integrity": "sha512-e3C/RRhGunWYNC3aSF7exsQkdXzQ/M+aYuZHKnw4U7KQwTJotnWsGOIVih0s2qQzmEzOFIJ3+xt7iq67K/p56Q==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@tailwindcss/oxide-wasm32-wasi": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-wasm32-wasi/-/oxide-wasm32-wasi-4.1.11.tgz", + "integrity": "sha512-Xo1+/GU0JEN/C/dvcammKHzeM6NqKovG+6921MR6oadee5XPBaKOumrJCXvopJ/Qb5TH7LX/UAywbqrP4lax0g==", + "bundleDependencies": [ + "@napi-rs/wasm-runtime", + "@emnapi/core", + "@emnapi/runtime", + "@tybys/wasm-util", + "@emnapi/wasi-threads", + "tslib" + ], + "cpu": [ + "wasm32" + ], + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "@emnapi/core": "^1.4.3", + "@emnapi/runtime": "^1.4.3", + "@emnapi/wasi-threads": "^1.0.2", + "@napi-rs/wasm-runtime": "^0.2.11", + "@tybys/wasm-util": "^0.9.0", + "tslib": "^2.8.0" + }, + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/@tailwindcss/oxide-win32-arm64-msvc": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-win32-arm64-msvc/-/oxide-win32-arm64-msvc-4.1.11.tgz", + "integrity": "sha512-UgKYx5PwEKrac3GPNPf6HVMNhUIGuUh4wlDFR2jYYdkX6pL/rn73zTq/4pzUm8fOjAn5L8zDeHp9iXmUGOXZ+w==", + "cpu": [ + "arm64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@tailwindcss/oxide-win32-x64-msvc": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/oxide-win32-x64-msvc/-/oxide-win32-x64-msvc-4.1.11.tgz", + "integrity": "sha512-YfHoggn1j0LK7wR82TOucWc5LDCguHnoS879idHekmmiR7g9HUtMw9MI0NHatS28u/Xlkfi9w5RJWgz2Dl+5Qg==", + "cpu": [ + "x64" + ], + "dev": true, + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@tailwindcss/typography": { + "version": "0.5.16", + "resolved": "https://registry.npmjs.org/@tailwindcss/typography/-/typography-0.5.16.tgz", + "integrity": "sha512-0wDLwCVF5V3x3b1SGXPCDcdsbDHMBe+lkFzBRaHeLvNi+nrrnZ1lA18u+OTWO8iSWU2GxUOCvlXtDuqftc1oiA==", + "dev": true, + "license": "MIT", + "dependencies": { + "lodash.castarray": "^4.4.0", + "lodash.isplainobject": "^4.0.6", + "lodash.merge": "^4.6.2", + "postcss-selector-parser": "6.0.10" + }, + "peerDependencies": { + "tailwindcss": ">=3.0.0 || insiders || >=4.0.0-alpha.20 || >=4.0.0-beta.1" + } + }, + "node_modules/@tailwindcss/vite": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/@tailwindcss/vite/-/vite-4.1.11.tgz", + "integrity": "sha512-RHYhrR3hku0MJFRV+fN2gNbDNEh3dwKvY8XJvTxCSXeMOsCRSr+uKvDWQcbizrHgjML6ZmTE5OwMrl5wKcujCw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@tailwindcss/node": "4.1.11", + "@tailwindcss/oxide": "4.1.11", + "tailwindcss": "4.1.11" + }, + "peerDependencies": { + "vite": "^5.2.0 || ^6 || ^7" + } + }, + "node_modules/@tailwindcss/vite/node_modules/tailwindcss": { + "version": "4.1.11", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.11.tgz", + "integrity": "sha512-2E9TBm6MDD/xKYe+dvJZAmg3yxIEDNRc0jwlNyDg/4Fil2QcSLjFKGVff0lAf1jjeaArlG/M75Ey/EYr/OJtBA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@testing-library/dom": { + "version": "10.4.1", + "resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.4.1.tgz", + "integrity": "sha512-o4PXJQidqJl82ckFaXUeoAW+XysPLauYI43Abki5hABd853iMhitooc6znOnczgbTYmEP6U6/y1ZyKAIsvMKGg==", + "dev": true, + "license": "MIT", + "peer": true, + "dependencies": { + "@babel/code-frame": "^7.10.4", + "@babel/runtime": "^7.12.5", + "@types/aria-query": "^5.0.1", + "aria-query": "5.3.0", + "dom-accessibility-api": "^0.5.9", + "lz-string": "^1.5.0", + "picocolors": "1.1.1", + "pretty-format": "^27.0.2" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@testing-library/jest-dom": { + "version": "6.9.1", + "resolved": "https://registry.npmjs.org/@testing-library/jest-dom/-/jest-dom-6.9.1.tgz", + "integrity": "sha512-zIcONa+hVtVSSep9UT3jZ5rizo2BsxgyDYU7WFD5eICBE7no3881HGeb/QkGfsJs6JTkY1aQhT7rIPC7e+0nnA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@adobe/css-tools": "^4.4.0", + "aria-query": "^5.0.0", + "css.escape": "^1.5.1", + "dom-accessibility-api": "^0.6.3", + "picocolors": "^1.1.1", + "redent": "^3.0.0" + }, + "engines": { + "node": ">=14", + "npm": ">=6", + "yarn": ">=1" + } + }, + "node_modules/@testing-library/jest-dom/node_modules/dom-accessibility-api": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.6.3.tgz", + "integrity": "sha512-7ZgogeTnjuHbo+ct10G9Ffp0mif17idi0IyWNVA/wcwcm7NPOD/WEHVP3n7n3MhXqxoIYm8d6MuZohYWIZ4T3w==", + "dev": true, + "license": "MIT" + }, + "node_modules/@testing-library/svelte-core": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/@testing-library/svelte-core/-/svelte-core-1.0.0.tgz", + "integrity": "sha512-VkUePoLV6oOYwSUvX6ShA8KLnJqZiYMIbP2JW2t0GLWLkJxKGvuH5qrrZBV/X7cXFnLGuFQEC7RheYiZOW68KQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=16" + }, + "peerDependencies": { + "svelte": "^3 || ^4 || ^5 || ^5.0.0-next.0" + } + }, + "node_modules/@testing-library/user-event": { + "version": "14.6.1", + "resolved": "https://registry.npmjs.org/@testing-library/user-event/-/user-event-14.6.1.tgz", + "integrity": "sha512-vq7fv0rnt+QTXgPxr5Hjc210p6YKq2kmdziLgnsZGgLJ9e6VAShx1pACLuRjd/AS/sr7phAR58OIIpf0LlmQNw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12", + "npm": ">=6" + }, + "peerDependencies": { + "@testing-library/dom": ">=7.21.4" + } + }, + "node_modules/@trickfilm400/rollup-plugin-off-main-thread": { + "version": "3.0.0-pre1", + "resolved": "https://registry.npmjs.org/@trickfilm400/rollup-plugin-off-main-thread/-/rollup-plugin-off-main-thread-3.0.0-pre1.tgz", + "integrity": "sha512-/67zpWDBLV+oYAEL682s1ktXL0HgqX76f6gaVGkGnVZlBbm1zd0v4Bz8MFF2GGhoX9rvfq3KSQHubFHwa6w6/Q==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "ejs": "^3.1.10", + "json5": "^2.2.3", + "magic-string": "^0.30.21", + "string.prototype.matchall": "^4.0.12" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/@tybys/wasm-util": { + "version": "0.10.2", + "resolved": "https://registry.npmjs.org/@tybys/wasm-util/-/wasm-util-0.10.2.tgz", + "integrity": "sha512-RoBvJ2X0wuKlWFIjrwffGw1IqZHKQqzIchKaadZZfnNpsAYp2mM0h36JtPCjNDAHGgYez/15uMBpfGwchhiMgg==", + "dev": true, + "license": "MIT", + "optional": true, + "dependencies": { + "tslib": "^2.4.0" + } + }, + "node_modules/@types/aria-query": { + "version": "5.0.4", + "resolved": "https://registry.npmjs.org/@types/aria-query/-/aria-query-5.0.4.tgz", + "integrity": "sha512-rfT93uj5s0PRL7EzccGMs3brplhcrghnDoV26NqKhCAS1hVo+WdNsPvE/yb6ilfr5hi2MEk6d5EWJTKdxg8jVw==", + "dev": true, + "license": "MIT", + "peer": true + }, + "node_modules/@types/chai": { + "version": "5.2.3", + "resolved": "https://registry.npmjs.org/@types/chai/-/chai-5.2.3.tgz", + "integrity": "sha512-Mw558oeA9fFbv65/y4mHtXDs9bPnFMZAL/jxdPFUpOHHIXX91mcgEHbS5Lahr+pwZFR8A7GQleRWeI6cGFC2UA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/deep-eql": "*", + "assertion-error": "^2.0.1" + } + }, + "node_modules/@types/cookie": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/@types/cookie/-/cookie-0.6.0.tgz", + "integrity": "sha512-4Kh9a6B2bQciAhf7FSuMRRkUWecJgJu9nPnx3yzpsfXX/c50REIqpHY4C82bXP90qrLtXtkDxTZosYO3UpOwlA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3": { + "version": "7.4.3", + "resolved": "https://registry.npmjs.org/@types/d3/-/d3-7.4.3.tgz", + "integrity": "sha512-lZXZ9ckh5R8uiFVt8ogUNf+pIrK4EsWrx2Np75WvF/eTpJ0FMHNhjXk8CKEx/+gpHbNQyJWehbFaTvqmHWB3ww==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/d3-array": "*", + "@types/d3-axis": "*", + "@types/d3-brush": "*", + "@types/d3-chord": "*", + "@types/d3-color": "*", + "@types/d3-contour": "*", + "@types/d3-delaunay": "*", + "@types/d3-dispatch": "*", + "@types/d3-drag": "*", + "@types/d3-dsv": "*", + "@types/d3-ease": "*", + "@types/d3-fetch": "*", + "@types/d3-force": "*", + "@types/d3-format": "*", + "@types/d3-geo": "*", + "@types/d3-hierarchy": "*", + "@types/d3-interpolate": "*", + "@types/d3-path": "*", + "@types/d3-polygon": "*", + "@types/d3-quadtree": "*", + "@types/d3-random": "*", + "@types/d3-scale": "*", + "@types/d3-scale-chromatic": "*", + "@types/d3-selection": "*", + "@types/d3-shape": "*", + "@types/d3-time": "*", + "@types/d3-time-format": "*", + "@types/d3-timer": "*", + "@types/d3-transition": "*", + "@types/d3-zoom": "*" + } + }, + "node_modules/@types/d3-array": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/@types/d3-array/-/d3-array-3.2.2.tgz", + "integrity": "sha512-hOLWVbm7uRza0BYXpIIW5pxfrKe0W+D5lrFiAEYR+pb6w3N2SwSMaJbXdUfSEv+dT4MfHBLtn5js0LAWaO6otw==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-axis": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/@types/d3-axis/-/d3-axis-3.0.6.tgz", + "integrity": "sha512-pYeijfZuBd87T0hGn0FO1vQ/cgLk6E1ALJjfkC0oJ8cbwkZl3TpgS8bVBLZN+2jjGgg38epgxb2zmoGtSfvgMw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/d3-selection": "*" + } + }, + "node_modules/@types/d3-brush": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/@types/d3-brush/-/d3-brush-3.0.6.tgz", + "integrity": "sha512-nH60IZNNxEcrh6L1ZSMNA28rj27ut/2ZmI3r96Zd+1jrZD++zD3LsMIjWlvg4AYrHn/Pqz4CF3veCxGjtbqt7A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/d3-selection": "*" + } + }, + "node_modules/@types/d3-chord": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/@types/d3-chord/-/d3-chord-3.0.6.tgz", + "integrity": "sha512-LFYWWd8nwfwEmTZG9PfQxd17HbNPksHBiJHaKuY1XeqscXacsS2tyoo6OdRsjf+NQYeB6XrNL3a25E3gH69lcg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-color": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/@types/d3-color/-/d3-color-3.1.3.tgz", + "integrity": "sha512-iO90scth9WAbmgv7ogoq57O9YpKmFBbmoEoCHDB2xMBY0+/KVrqAaCDyCE16dUspeOvIxFFRI+0sEtqDqy2b4A==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-contour": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/@types/d3-contour/-/d3-contour-3.0.6.tgz", + "integrity": "sha512-BjzLgXGnCWjUSYGfH1cpdo41/hgdWETu4YxpezoztawmqsvCeep+8QGfiY6YbDvfgHz/DkjeIkkZVJavB4a3rg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/d3-array": "*", + "@types/geojson": "*" } }, "node_modules/@types/d3-delaunay": { @@ -3015,3979 +5834,5155 @@ "resolved": "https://registry.npmjs.org/@types/d3-delaunay/-/d3-delaunay-6.0.4.tgz", "integrity": "sha512-ZMaSKu4THYCU6sV64Lhg6qjf1orxBthaC161plr5KuPHo3CNm8DTHiLw/5Eq2b6TsNP0W0iJrUOFscY6Q450Hw==", "dev": true, - "license": "MIT" + "license": "MIT" + }, + "node_modules/@types/d3-dispatch": { + "version": "3.0.7", + "resolved": "https://registry.npmjs.org/@types/d3-dispatch/-/d3-dispatch-3.0.7.tgz", + "integrity": "sha512-5o9OIAdKkhN1QItV2oqaE5KMIiXAvDWBDPrD85e58Qlz1c1kI/J0NcqbEG88CoTwJrYe7ntUCVfeUl2UJKbWgA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-drag": { + "version": "3.0.7", + "resolved": "https://registry.npmjs.org/@types/d3-drag/-/d3-drag-3.0.7.tgz", + "integrity": "sha512-HE3jVKlzU9AaMazNufooRJ5ZpWmLIoc90A37WU2JMmeq28w1FQqCZswHZ3xR+SuxYftzHq6WU6KJHvqxKzTxxQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/d3-selection": "*" + } + }, + "node_modules/@types/d3-dsv": { + "version": "3.0.7", + "resolved": "https://registry.npmjs.org/@types/d3-dsv/-/d3-dsv-3.0.7.tgz", + "integrity": "sha512-n6QBF9/+XASqcKK6waudgL0pf/S5XHPPI8APyMLLUHd8NqouBGLsU8MgtO7NINGtPBtk9Kko/W4ea0oAspwh9g==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-ease": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/@types/d3-ease/-/d3-ease-3.0.2.tgz", + "integrity": "sha512-NcV1JjO5oDzoK26oMzbILE6HW7uVXOHLQvHshBUW4UMdZGfiY6v5BeQwh9a9tCzv+CeefZQHJt5SRgK154RtiA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-fetch": { + "version": "3.0.7", + "resolved": "https://registry.npmjs.org/@types/d3-fetch/-/d3-fetch-3.0.7.tgz", + "integrity": "sha512-fTAfNmxSb9SOWNB9IoG5c8Hg6R+AzUHDRlsXsDZsNp6sxAEOP0tkP3gKkNSO/qmHPoBFTxNrjDprVHDQDvo5aA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/d3-dsv": "*" + } + }, + "node_modules/@types/d3-force": { + "version": "3.0.10", + "resolved": "https://registry.npmjs.org/@types/d3-force/-/d3-force-3.0.10.tgz", + "integrity": "sha512-ZYeSaCF3p73RdOKcjj+swRlZfnYpK1EbaDiYICEEp5Q6sUiqFaFQ9qgoshp5CzIyyb/yD09kD9o2zEltCexlgw==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-format": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/d3-format/-/d3-format-3.0.4.tgz", + "integrity": "sha512-fALi2aI6shfg7vM5KiR1wNJnZ7r6UuggVqtDA+xiEdPZQwy/trcQaHnwShLuLdta2rTymCNpxYTiMZX/e09F4g==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-geo": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/@types/d3-geo/-/d3-geo-3.1.0.tgz", + "integrity": "sha512-856sckF0oP/diXtS4jNsiQw/UuK5fQG8l/a9VVLeSouf1/PPbBE1i1W852zVwKwYCBkFJJB7nCFTbk6UMEXBOQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/geojson": "*" + } + }, + "node_modules/@types/d3-hierarchy": { + "version": "3.1.7", + "resolved": "https://registry.npmjs.org/@types/d3-hierarchy/-/d3-hierarchy-3.1.7.tgz", + "integrity": "sha512-tJFtNoYBtRtkNysX1Xq4sxtjK8YgoWUNpIiUee0/jHGRwqvzYxkq0hGVbbOGSz+JgFxxRu4K8nb3YpG3CMARtg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-interpolate": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/d3-interpolate/-/d3-interpolate-3.0.4.tgz", + "integrity": "sha512-mgLPETlrpVV1YRJIglr4Ez47g7Yxjl1lj7YKsiMCb27VJH9W8NVM6Bb9d8kkpG/uAQS5AmbA48q2IAolKKo1MA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/d3-color": "*" + } + }, + "node_modules/@types/d3-path": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/@types/d3-path/-/d3-path-3.1.1.tgz", + "integrity": "sha512-VMZBYyQvbGmWyWVea0EHs/BwLgxc+MKi1zLDCONksozI4YJMcTt8ZEuIR4Sb1MMTE8MMW49v0IwI5+b7RmfWlg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-polygon": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/@types/d3-polygon/-/d3-polygon-3.0.2.tgz", + "integrity": "sha512-ZuWOtMaHCkN9xoeEMr1ubW2nGWsp4nIql+OPQRstu4ypeZ+zk3YKqQT0CXVe/PYqrKpZAi+J9mTs05TKwjXSRA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-quadtree": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/@types/d3-quadtree/-/d3-quadtree-3.0.6.tgz", + "integrity": "sha512-oUzyO1/Zm6rsxKRHA1vH0NEDG58HrT5icx/azi9MF1TWdtttWl0UIUsjEQBBh+SIkrpd21ZjEv7ptxWys1ncsg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-random": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/d3-random/-/d3-random-3.0.3.tgz", + "integrity": "sha512-Imagg1vJ3y76Y2ea0871wpabqp613+8/r0mCLEBfdtqC7xMSfj9idOnmBYyMoULfHePJyxMAw3nWhJxzc+LFwQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-scale": { + "version": "4.0.9", + "resolved": "https://registry.npmjs.org/@types/d3-scale/-/d3-scale-4.0.9.tgz", + "integrity": "sha512-dLmtwB8zkAeO/juAMfnV+sItKjlsw2lKdZVVy6LRr0cBmegxSABiLEpGVmSJJ8O08i4+sGR6qQtb6WtuwJdvVw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/d3-time": "*" + } + }, + "node_modules/@types/d3-scale-chromatic": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/@types/d3-scale-chromatic/-/d3-scale-chromatic-3.1.0.tgz", + "integrity": "sha512-iWMJgwkK7yTRmWqRB5plb1kadXyQ5Sj8V/zYlFGMUBbIPKQScw+Dku9cAAMgJG+z5GYDoMjWGLVOvjghDEFnKQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-selection": { + "version": "3.0.11", + "resolved": "https://registry.npmjs.org/@types/d3-selection/-/d3-selection-3.0.11.tgz", + "integrity": "sha512-bhAXu23DJWsrI45xafYpkQ4NtcKMwWnAC/vKrd2l+nxMFuvOT3XMYTIj2opv8vq8AO5Yh7Qac/nSeP/3zjTK0w==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-shape": { + "version": "3.1.8", + "resolved": "https://registry.npmjs.org/@types/d3-shape/-/d3-shape-3.1.8.tgz", + "integrity": "sha512-lae0iWfcDeR7qt7rA88BNiqdvPS5pFVPpo5OfjElwNaT2yyekbM0C9vK+yqBqEmHr6lDkRnYNoTBYlAgJa7a4w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/d3-path": "*" + } + }, + "node_modules/@types/d3-time": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/d3-time/-/d3-time-3.0.4.tgz", + "integrity": "sha512-yuzZug1nkAAaBlBBikKZTgzCeA+k1uy4ZFwWANOfKw5z5LRhV0gNA7gNkKm7HoK+HRN0wX3EkxGk0fpbWhmB7g==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-time-format": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/@types/d3-time-format/-/d3-time-format-4.0.3.tgz", + "integrity": "sha512-5xg9rC+wWL8kdDj153qZcsJ0FWiFt0J5RB6LYUNZjwSnesfblqrI/bJ1wBdJ8OQfncgbJG5+2F+qfqnqyzYxyg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-timer": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/@types/d3-timer/-/d3-timer-3.0.2.tgz", + "integrity": "sha512-Ps3T8E8dZDam6fUyNiMkekK3XUsaUEik+idO9/YjPtfj2qruF8tFBXS7XhtE4iIXBLxhmLjP3SXpLhVf21I9Lw==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/d3-transition": { + "version": "3.0.9", + "resolved": "https://registry.npmjs.org/@types/d3-transition/-/d3-transition-3.0.9.tgz", + "integrity": "sha512-uZS5shfxzO3rGlu0cC3bjmMFKsXv+SmZZcgp0KD22ts4uGXp5EVYGzu/0YdwZeKmddhcAccYtREJKkPfXkZuCg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/d3-selection": "*" + } + }, + "node_modules/@types/d3-zoom": { + "version": "3.0.8", + "resolved": "https://registry.npmjs.org/@types/d3-zoom/-/d3-zoom-3.0.8.tgz", + "integrity": "sha512-iqMC4/YlFCSlO8+2Ii1GGGliCAY4XdeG748w5vQUbevlbDu0zSjH/+jojorQVBK/se0j6DUFNPBGSqD3YWYnDw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/d3-interpolate": "*", + "@types/d3-selection": "*" + } + }, + "node_modules/@types/debug": { + "version": "4.1.13", + "resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.13.tgz", + "integrity": "sha512-KSVgmQmzMwPlmtljOomayoR89W4FynCAi3E8PPs7vmDVPe84hT+vGPKkJfThkmXs0x0jAaa9U8uW8bbfyS2fWw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/ms": "*" + } + }, + "node_modules/@types/deep-eql": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/@types/deep-eql/-/deep-eql-4.0.2.tgz", + "integrity": "sha512-c9h9dVVMigMPc4bwTvC5dxqtqJZwQPePsWjPlpSOnojbor6pGqdk541lfA7AqFQr5pB1BRdq0juY9db81BwyFw==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/estree": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.9.tgz", + "integrity": "sha512-GhdPgy1el4/ImP05X05Uw4cw2/M93BCUmnEvWZNStlCzEKME4Fkk+YpoA5OiHNQmoS7Cafb8Xa3Pya8m1Qrzeg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/geojson": { + "version": "7946.0.16", + "resolved": "https://registry.npmjs.org/@types/geojson/-/geojson-7946.0.16.tgz", + "integrity": "sha512-6C8nqWur3j98U6+lXDfTUWIfgvZU+EumvpHKcYjujKH7woYyLj2sUmff0tRhrqM7BohUw7Pz3ZB1jj2gW9Fvmg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/hast": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@types/hast/-/hast-3.0.4.tgz", + "integrity": "sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/unist": "*" + } + }, + "node_modules/@types/json-schema": { + "version": "7.0.15", + "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz", + "integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/katex": { + "version": "0.16.8", + "resolved": "https://registry.npmjs.org/@types/katex/-/katex-0.16.8.tgz", + "integrity": "sha512-trgaNyfU+Xh2Tc+ABIb44a5AYUpicB3uwirOioeOkNPPbmgRNtcWyDeeFRzjPZENO9Vq8gvVqfhaaXWLlevVwg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/mdast": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/@types/mdast/-/mdast-4.0.4.tgz", + "integrity": "sha512-kGaNbPh1k7AFzgpud/gMdvIm5xuECykRR+JnWKQno9TAXVa6WIVCGTPvYGekIDL4uwCZQSYbUxNBSb1aUo79oA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/unist": "*" + } + }, + "node_modules/@types/mdx": { + "version": "2.0.13", + "resolved": "https://registry.npmjs.org/@types/mdx/-/mdx-2.0.13.tgz", + "integrity": "sha512-+OWZQfAYyio6YkJb3HLxDrvnx6SWWDbC0zVPfBRzUk0/nqoDyf6dNxQi3eArPe8rJ473nobTMQ/8Zk+LxJ+Yuw==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/ms": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/@types/ms/-/ms-2.1.0.tgz", + "integrity": "sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/node": { + "version": "24.13.0", + "resolved": "https://registry.npmjs.org/@types/node/-/node-24.13.0.tgz", + "integrity": "sha512-5vtOqGQr4NJKeEzV441FcOi2MeG9UTWq9LqVLGneDdu4vlX17H8kQ2PA2UmNwCUGPVDj4oBjNhS7ReVEIWJJrg==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.18.0" + } + }, + "node_modules/@types/react": { + "version": "19.2.17", + "resolved": "https://registry.npmjs.org/@types/react/-/react-19.2.17.tgz", + "integrity": "sha512-MXfmqaVPEVgkBT/aY0aGCkRWWtByiYQXo3xdQ8r5RzuFrPiRn8Gar2tQdXSUQ2GKV3bkXckek89V8wQBY2Q/Aw==", + "dev": true, + "license": "MIT", + "peer": true, + "dependencies": { + "csstype": "^3.2.2" + } + }, + "node_modules/@types/resolve": { + "version": "1.20.2", + "resolved": "https://registry.npmjs.org/@types/resolve/-/resolve-1.20.2.tgz", + "integrity": "sha512-60BCwRFOZCQhDncwQdxxeOEEkbc5dIMccYLwbxsS4TUNeVECQ/pBJ0j09mrHOl/JJvpRPGwO9SvE4nR2Nb/a4Q==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/trusted-types": { + "version": "2.0.7", + "resolved": "https://registry.npmjs.org/@types/trusted-types/-/trusted-types-2.0.7.tgz", + "integrity": "sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/unist": { + "version": "2.0.11", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", + "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "dev": true, + "license": "MIT" + }, + "node_modules/@typescript-eslint/eslint-plugin": { + "version": "8.60.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-8.60.1.tgz", + "integrity": "sha512-JQ4S5GB0tfjO8BuJ4fcX+HodkzJjYBV+7OJ+wLygaX7OGQ7FudyHL4NSCA6ob+w3Yn+5MkKIozOwQhXeM7opVg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@eslint-community/regexpp": "^4.12.2", + "@typescript-eslint/scope-manager": "8.60.1", + "@typescript-eslint/type-utils": "8.60.1", + "@typescript-eslint/utils": "8.60.1", + "@typescript-eslint/visitor-keys": "8.60.1", + "ignore": "^7.0.5", + "natural-compare": "^1.4.0", + "ts-api-utils": "^2.5.0" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "@typescript-eslint/parser": "^8.60.1", + "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", + "typescript": ">=4.8.4 <6.1.0" + } + }, + "node_modules/@typescript-eslint/eslint-plugin/node_modules/ignore": { + "version": "7.0.5", + "resolved": "https://registry.npmjs.org/ignore/-/ignore-7.0.5.tgz", + "integrity": "sha512-Hs59xBNfUIunMFgWAbGX5cq6893IbWg4KnrjbYwX3tx0ztorVgTDA6B2sxf8ejHJ4wz8BqGUMYlnzNBer5NvGg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 4" + } + }, + "node_modules/@typescript-eslint/parser": { + "version": "8.60.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-8.60.1.tgz", + "integrity": "sha512-A0M6ua6H252bVjPvvtSgl2QA4+ET9S5Mtkb2GDyTxIhH/C4qDItT7RQNO5PhMC6NXGYXOR9dIalcDDgBKT7oFA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@typescript-eslint/scope-manager": "8.60.1", + "@typescript-eslint/types": "8.60.1", + "@typescript-eslint/typescript-estree": "8.60.1", + "@typescript-eslint/visitor-keys": "8.60.1", + "debug": "^4.4.3" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", + "typescript": ">=4.8.4 <6.1.0" + } + }, + "node_modules/@typescript-eslint/project-service": { + "version": "8.60.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/project-service/-/project-service-8.60.1.tgz", + "integrity": "sha512-eXkTH2bxmXlqD1RnOPmLZ9ZM9D3VwSx04JOwBnP9RQ+yUA5a2Mu7SfW8uaV2Aon53NJzZlZYuX7tn91Izf+xaw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@typescript-eslint/tsconfig-utils": "^8.60.1", + "@typescript-eslint/types": "^8.60.1", + "debug": "^4.4.3" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "typescript": ">=4.8.4 <6.1.0" + } + }, + "node_modules/@typescript-eslint/scope-manager": { + "version": "8.60.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-8.60.1.tgz", + "integrity": "sha512-gvI5OQoptnxQnchOirukCuQ55svJSTuD/4k5+pC267xyBtYry748R9/c3tYUzb/iE6RZfllRz2lVulLCHkTm4w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@typescript-eslint/types": "8.60.1", + "@typescript-eslint/visitor-keys": "8.60.1" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + } + }, + "node_modules/@typescript-eslint/tsconfig-utils": { + "version": "8.60.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/tsconfig-utils/-/tsconfig-utils-8.60.1.tgz", + "integrity": "sha512-nh8w4qAteiKuZu3pSSzG/yGKpw0OlkrKnzFmbVRenKaD4qc+7i1GrmZaLVkr8rk4uipiPGMOW4YsM6WmKZ5CvA==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "typescript": ">=4.8.4 <6.1.0" + } + }, + "node_modules/@typescript-eslint/type-utils": { + "version": "8.60.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/type-utils/-/type-utils-8.60.1.tgz", + "integrity": "sha512-sdwTrpjosW7ANQYJ39ZBF1ZyEMEGVB2UsikrserVM/30a/F1dTLnu9bGxEdosugyu5caigjLrR2qiD11asjI1A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@typescript-eslint/types": "8.60.1", + "@typescript-eslint/typescript-estree": "8.60.1", + "@typescript-eslint/utils": "8.60.1", + "debug": "^4.4.3", + "ts-api-utils": "^2.5.0" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", + "typescript": ">=4.8.4 <6.1.0" + } + }, + "node_modules/@typescript-eslint/types": { + "version": "8.60.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-8.60.1.tgz", + "integrity": "sha512-4h0tY8ppCkdCzcrl2YM5M3my0xsE1Tf8om3owEu5oPWmXwkKRmk0j0LGDzYBGUcAlesEbxBhazqu/K4cu3Ug7w==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + } + }, + "node_modules/@typescript-eslint/typescript-estree": { + "version": "8.60.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-8.60.1.tgz", + "integrity": "sha512-alpRkfG8hlVE5kdJW2GkfgDgXxold3e8e4l6EnmhRmRLbekgAPCCGDVD++sABy9FcgPFroq+uFcCSM1vR57Cew==", + "dev": true, + "license": "MIT", + "dependencies": { + "@typescript-eslint/project-service": "8.60.1", + "@typescript-eslint/tsconfig-utils": "8.60.1", + "@typescript-eslint/types": "8.60.1", + "@typescript-eslint/visitor-keys": "8.60.1", + "debug": "^4.4.3", + "minimatch": "^10.2.2", + "semver": "^7.7.3", + "tinyglobby": "^0.2.15", + "ts-api-utils": "^2.5.0" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "typescript": ">=4.8.4 <6.1.0" + } + }, + "node_modules/@typescript-eslint/typescript-estree/node_modules/balanced-match": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-4.0.4.tgz", + "integrity": "sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==", + "dev": true, + "license": "MIT", + "engines": { + "node": "18 || 20 || >=22" + } + }, + "node_modules/@typescript-eslint/typescript-estree/node_modules/brace-expansion": { + "version": "5.0.6", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.6.tgz", + "integrity": "sha512-kLpxurY4Z4r9sgMsyG0Z9uzsBlgiU/EFKhj/h91/8yHu0edo7XuixOIH3VcJ8kkxs6/jPzoI6U9Vj3WqbMQ94g==", + "dev": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^4.0.2" + }, + "engines": { + "node": "18 || 20 || >=22" + } + }, + "node_modules/@typescript-eslint/typescript-estree/node_modules/minimatch": { + "version": "10.2.5", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.2.5.tgz", + "integrity": "sha512-MULkVLfKGYDFYejP07QOurDLLQpcjk7Fw+7jXS2R2czRQzR56yHRveU5NDJEOviH+hETZKSkIk5c+T23GjFUMg==", + "dev": true, + "license": "BlueOak-1.0.0", + "dependencies": { + "brace-expansion": "^5.0.5" + }, + "engines": { + "node": "18 || 20 || >=22" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/@typescript-eslint/utils": { + "version": "8.60.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/utils/-/utils-8.60.1.tgz", + "integrity": "sha512-h2MPBLoNtjc3qZWfY3Tl51yPorQ2McHn8pJfcMNTcIvrrZrr90Ykffit0yjrPFWQcRcUxzH20+6OcVdW4yHtUg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@eslint-community/eslint-utils": "^4.9.1", + "@typescript-eslint/scope-manager": "8.60.1", + "@typescript-eslint/types": "8.60.1", + "@typescript-eslint/typescript-estree": "8.60.1" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + }, + "peerDependencies": { + "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", + "typescript": ">=4.8.4 <6.1.0" + } + }, + "node_modules/@typescript-eslint/visitor-keys": { + "version": "8.60.1", + "resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-8.60.1.tgz", + "integrity": "sha512-EbGRQg4FhrmwLodl+t3JNAnXHWVr9Vp+Zl1QBZVPY4ByfkzIT8cX3K6QWODHtkIZqqJVEWvhHSx3v5PDHsaQag==", + "dev": true, + "license": "MIT", + "dependencies": { + "@typescript-eslint/types": "8.60.1", + "eslint-visitor-keys": "^5.0.0" + }, + "engines": { + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/typescript-eslint" + } + }, + "node_modules/@typescript-eslint/visitor-keys/node_modules/eslint-visitor-keys": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-5.0.1.tgz", + "integrity": "sha512-tD40eHxA35h0PEIZNeIjkHoDR4YjjJp34biM0mDvplBe//mB+IHCqHDGV7pxF+7MklTvighcCPPZC7ynWyjdTA==", + "dev": true, + "license": "Apache-2.0", + "engines": { + "node": "^20.19.0 || ^22.13.0 || >=24" + }, + "funding": { + "url": "https://opencollective.com/eslint" + } + }, + "node_modules/@ungap/structured-clone": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/@ungap/structured-clone/-/structured-clone-1.3.1.tgz", + "integrity": "sha512-mUFwbeTqrVgDQxFveS+df2yfap6iuP20NAKAsBt5jDEoOTDew+zwLAOilHCeQJOVSvmgCX4ogqIrA0mnyr08yQ==", + "dev": true, + "license": "ISC" + }, + "node_modules/@upsetjs/venn.js": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/@upsetjs/venn.js/-/venn.js-2.0.0.tgz", + "integrity": "sha512-WbBhLrooyePuQ1VZxrJjtLvTc4NVfpOyKx0sKqioq9bX1C1m7Jgykkn8gLrtwumBioXIqam8DLxp88Adbue6Hw==", + "dev": true, + "license": "MIT", + "optionalDependencies": { + "d3-selection": "^3.0.0", + "d3-transition": "^3.0.1" + } + }, + "node_modules/@vite-pwa/assets-generator": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@vite-pwa/assets-generator/-/assets-generator-1.0.2.tgz", + "integrity": "sha512-MCbrb508JZHqe7bUibmZj/lyojdhLRnfkmyXnkrCM2zVrjTgL89U8UEfInpKTvPeTnxsw2hmyZxnhsdNR6yhwg==", + "dev": true, + "license": "MIT", + "dependencies": { + "cac": "^6.7.14", + "colorette": "^2.0.20", + "consola": "^3.4.2", + "sharp": "^0.33.5", + "sharp-ico": "^0.1.5", + "unconfig": "^7.3.1" + }, + "bin": { + "pwa-assets-generator": "bin/pwa-assets-generator.mjs" + }, + "engines": { + "node": ">=16.14.0" + }, + "funding": { + "url": "https://github.com/sponsors/antfu" + } + }, + "node_modules/@vite-pwa/sveltekit": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@vite-pwa/sveltekit/-/sveltekit-1.1.0.tgz", + "integrity": "sha512-mMIf2tY+7Hg8jecpu/WY+Ki2ikoXy3hVmt3tOxi0K+lYYnKQrDYthuHireI0S+26Mg9BXzL7qQF1xeB5VYlYlg==", + "dev": true, + "license": "MIT", + "dependencies": { + "kolorist": "^1.8.0", + "tinyglobby": "^0.2.9", + "vite-plugin-pwa": "^1.2.0" + }, + "engines": { + "node": ">=18.13" + }, + "funding": { + "url": "https://github.com/sponsors/antfu" + }, + "peerDependencies": { + "@sveltejs/kit": "^1.3.1 || ^2.0.1", + "@vite-pwa/assets-generator": "^1.0.0" + }, + "peerDependenciesMeta": { + "@vite-pwa/assets-generator": { + "optional": true + } + } + }, + "node_modules/@vitest/browser": { + "version": "4.1.8", + "resolved": "https://registry.npmjs.org/@vitest/browser/-/browser-4.1.8.tgz", + "integrity": "sha512-u21VzX07HzlJYpFgkxmjEXar/tG2UqWGgyGG/46SrrPc7rSdCTPw5vuowopO9CIqF8UCUQzDFdbVnNpw6N0BfQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@blazediff/core": "1.9.1", + "@vitest/mocker": "4.1.8", + "@vitest/utils": "4.1.8", + "magic-string": "^0.30.21", + "pngjs": "^7.0.0", + "sirv": "^3.0.2", + "tinyrainbow": "^3.1.0", + "ws": "^8.19.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" + }, + "peerDependencies": { + "vitest": "4.1.8" + } + }, + "node_modules/@vitest/browser-playwright": { + "version": "4.1.8", + "resolved": "https://registry.npmjs.org/@vitest/browser-playwright/-/browser-playwright-4.1.8.tgz", + "integrity": "sha512-SR7FqgegaexEg73xvf3ArtygXegagMdXnL0EZMpxrWvvhQxvicD/E8p0ib0J91riPRtQUViyh67Xjw3NqvyhVg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@vitest/browser": "4.1.8", + "@vitest/mocker": "4.1.8", + "tinyrainbow": "^3.1.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" + }, + "peerDependencies": { + "playwright": "*", + "vitest": "4.1.8" + }, + "peerDependenciesMeta": { + "playwright": { + "optional": false + } + } }, - "node_modules/@types/d3-dispatch": { - "version": "3.0.7", - "resolved": "https://registry.npmjs.org/@types/d3-dispatch/-/d3-dispatch-3.0.7.tgz", - "integrity": "sha512-5o9OIAdKkhN1QItV2oqaE5KMIiXAvDWBDPrD85e58Qlz1c1kI/J0NcqbEG88CoTwJrYe7ntUCVfeUl2UJKbWgA==", + "node_modules/@vitest/browser-playwright/node_modules/tinyrainbow": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", + "integrity": "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==", "dev": true, - "license": "MIT" + "license": "MIT", + "engines": { + "node": ">=14.0.0" + } }, - "node_modules/@types/d3-drag": { - "version": "3.0.7", - "resolved": "https://registry.npmjs.org/@types/d3-drag/-/d3-drag-3.0.7.tgz", - "integrity": "sha512-HE3jVKlzU9AaMazNufooRJ5ZpWmLIoc90A37WU2JMmeq28w1FQqCZswHZ3xR+SuxYftzHq6WU6KJHvqxKzTxxQ==", + "node_modules/@vitest/browser/node_modules/tinyrainbow": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", + "integrity": "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==", "dev": true, "license": "MIT", - "dependencies": { - "@types/d3-selection": "*" + "engines": { + "node": ">=14.0.0" } }, - "node_modules/@types/d3-dsv": { - "version": "3.0.7", - "resolved": "https://registry.npmjs.org/@types/d3-dsv/-/d3-dsv-3.0.7.tgz", - "integrity": "sha512-n6QBF9/+XASqcKK6waudgL0pf/S5XHPPI8APyMLLUHd8NqouBGLsU8MgtO7NINGtPBtk9Kko/W4ea0oAspwh9g==", + "node_modules/@vitest/coverage-v8": { + "version": "4.1.8", + "resolved": "https://registry.npmjs.org/@vitest/coverage-v8/-/coverage-v8-4.1.8.tgz", + "integrity": "sha512-lt3kovsyHwYe00wq4D1ti0Z974fWj4NLp6siqiyEufUpyFwK9Yhi7rBhac9JL5aA0zoMrJqc4vYPZRUnI7l7nw==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "@bcoe/v8-coverage": "^1.0.2", + "@vitest/utils": "4.1.8", + "ast-v8-to-istanbul": "^1.0.0", + "istanbul-lib-coverage": "^3.2.2", + "istanbul-lib-report": "^3.0.1", + "istanbul-reports": "^3.2.0", + "magicast": "^0.5.2", + "obug": "^2.1.1", + "std-env": "^4.0.0-rc.1", + "tinyrainbow": "^3.1.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" + }, + "peerDependencies": { + "@vitest/browser": "4.1.8", + "vitest": "4.1.8" + }, + "peerDependenciesMeta": { + "@vitest/browser": { + "optional": true + } + } }, - "node_modules/@types/d3-ease": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/@types/d3-ease/-/d3-ease-3.0.2.tgz", - "integrity": "sha512-NcV1JjO5oDzoK26oMzbILE6HW7uVXOHLQvHshBUW4UMdZGfiY6v5BeQwh9a9tCzv+CeefZQHJt5SRgK154RtiA==", + "node_modules/@vitest/coverage-v8/node_modules/tinyrainbow": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", + "integrity": "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==", "dev": true, - "license": "MIT" + "license": "MIT", + "engines": { + "node": ">=14.0.0" + } }, - "node_modules/@types/d3-fetch": { - "version": "3.0.7", - "resolved": "https://registry.npmjs.org/@types/d3-fetch/-/d3-fetch-3.0.7.tgz", - "integrity": "sha512-fTAfNmxSb9SOWNB9IoG5c8Hg6R+AzUHDRlsXsDZsNp6sxAEOP0tkP3gKkNSO/qmHPoBFTxNrjDprVHDQDvo5aA==", + "node_modules/@vitest/expect": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/@vitest/expect/-/expect-3.2.4.tgz", + "integrity": "sha512-Io0yyORnB6sikFlt8QW5K7slY4OjqNX9jmJQ02QDda8lyM6B5oNgVWoSoKPac8/kgnCUzuHQKrSLtu/uOqqrig==", "dev": true, "license": "MIT", "dependencies": { - "@types/d3-dsv": "*" + "@types/chai": "^5.2.2", + "@vitest/spy": "3.2.4", + "@vitest/utils": "3.2.4", + "chai": "^5.2.0", + "tinyrainbow": "^2.0.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" } }, - "node_modules/@types/d3-force": { - "version": "3.0.10", - "resolved": "https://registry.npmjs.org/@types/d3-force/-/d3-force-3.0.10.tgz", - "integrity": "sha512-ZYeSaCF3p73RdOKcjj+swRlZfnYpK1EbaDiYICEEp5Q6sUiqFaFQ9qgoshp5CzIyyb/yD09kD9o2zEltCexlgw==", + "node_modules/@vitest/expect/node_modules/@vitest/pretty-format": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-3.2.4.tgz", + "integrity": "sha512-IVNZik8IVRJRTr9fxlitMKeJeXFFFN0JaB9PHPGQ8NKQbGpfjlTx9zO4RefN8gp7eqjNy8nyK3NZmBzOPeIxtA==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "tinyrainbow": "^2.0.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } }, - "node_modules/@types/d3-format": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/@types/d3-format/-/d3-format-3.0.4.tgz", - "integrity": "sha512-fALi2aI6shfg7vM5KiR1wNJnZ7r6UuggVqtDA+xiEdPZQwy/trcQaHnwShLuLdta2rTymCNpxYTiMZX/e09F4g==", + "node_modules/@vitest/expect/node_modules/@vitest/spy": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/@vitest/spy/-/spy-3.2.4.tgz", + "integrity": "sha512-vAfasCOe6AIK70iP5UD11Ac4siNUNJ9i/9PZ3NKx07sG6sUxeag1LWdNrMWeKKYBLlzuK+Gn65Yd5nyL6ds+nw==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "tinyspy": "^4.0.3" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } }, - "node_modules/@types/d3-geo": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/@types/d3-geo/-/d3-geo-3.1.0.tgz", - "integrity": "sha512-856sckF0oP/diXtS4jNsiQw/UuK5fQG8l/a9VVLeSouf1/PPbBE1i1W852zVwKwYCBkFJJB7nCFTbk6UMEXBOQ==", + "node_modules/@vitest/expect/node_modules/@vitest/utils": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-3.2.4.tgz", + "integrity": "sha512-fB2V0JFrQSMsCo9HiSq3Ezpdv4iYaXRG1Sx8edX3MwxfyNn83mKiGzOcH+Fkxt4MHxr3y42fQi1oeAInqgX2QA==", "dev": true, "license": "MIT", "dependencies": { - "@types/geojson": "*" + "@vitest/pretty-format": "3.2.4", + "loupe": "^3.1.4", + "tinyrainbow": "^2.0.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" } }, - "node_modules/@types/d3-hierarchy": { - "version": "3.1.7", - "resolved": "https://registry.npmjs.org/@types/d3-hierarchy/-/d3-hierarchy-3.1.7.tgz", - "integrity": "sha512-tJFtNoYBtRtkNysX1Xq4sxtjK8YgoWUNpIiUee0/jHGRwqvzYxkq0hGVbbOGSz+JgFxxRu4K8nb3YpG3CMARtg==", + "node_modules/@vitest/mocker": { + "version": "4.1.8", + "resolved": "https://registry.npmjs.org/@vitest/mocker/-/mocker-4.1.8.tgz", + "integrity": "sha512-LEiN/xe4OSIbKe9HQIp5OC24agGD9J5CnmMgsLohVVoOPWL9a2sBoR6VBx43jQZb7Kr1l4RCuyCJzcAa0+dojw==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "@vitest/spy": "4.1.8", + "estree-walker": "^3.0.3", + "magic-string": "^0.30.21" + }, + "funding": { + "url": "https://opencollective.com/vitest" + }, + "peerDependencies": { + "msw": "^2.4.9", + "vite": "^6.0.0 || ^7.0.0 || ^8.0.0" + }, + "peerDependenciesMeta": { + "msw": { + "optional": true + }, + "vite": { + "optional": true + } + } }, - "node_modules/@types/d3-interpolate": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/@types/d3-interpolate/-/d3-interpolate-3.0.4.tgz", - "integrity": "sha512-mgLPETlrpVV1YRJIglr4Ez47g7Yxjl1lj7YKsiMCb27VJH9W8NVM6Bb9d8kkpG/uAQS5AmbA48q2IAolKKo1MA==", + "node_modules/@vitest/pretty-format": { + "version": "4.1.8", + "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-4.1.8.tgz", + "integrity": "sha512-9GasEBxpZ1VYIpqHf/0+YGg121uSNwCKOJqIrTwWP/TB7DmFCiaBpNl3aPZzoLWfWkuqhbH8vJIVobZkvdo2cA==", "dev": true, "license": "MIT", "dependencies": { - "@types/d3-color": "*" + "tinyrainbow": "^3.1.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" } }, - "node_modules/@types/d3-path": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/@types/d3-path/-/d3-path-3.1.1.tgz", - "integrity": "sha512-VMZBYyQvbGmWyWVea0EHs/BwLgxc+MKi1zLDCONksozI4YJMcTt8ZEuIR4Sb1MMTE8MMW49v0IwI5+b7RmfWlg==", + "node_modules/@vitest/pretty-format/node_modules/tinyrainbow": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", + "integrity": "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==", "dev": true, - "license": "MIT" + "license": "MIT", + "engines": { + "node": ">=14.0.0" + } }, - "node_modules/@types/d3-polygon": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/@types/d3-polygon/-/d3-polygon-3.0.2.tgz", - "integrity": "sha512-ZuWOtMaHCkN9xoeEMr1ubW2nGWsp4nIql+OPQRstu4ypeZ+zk3YKqQT0CXVe/PYqrKpZAi+J9mTs05TKwjXSRA==", + "node_modules/@vitest/runner": { + "version": "4.1.8", + "resolved": "https://registry.npmjs.org/@vitest/runner/-/runner-4.1.8.tgz", + "integrity": "sha512-EmVxeBAfMJvycdjd6Hm+RbFBbA9fKvo0Kx37hNpBYoYeavH3RNsBXWDooR1mgD52dCrxIIuP7UotpfiwOikvcg==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "@vitest/utils": "4.1.8", + "pathe": "^2.0.3" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } }, - "node_modules/@types/d3-quadtree": { - "version": "3.0.6", - "resolved": "https://registry.npmjs.org/@types/d3-quadtree/-/d3-quadtree-3.0.6.tgz", - "integrity": "sha512-oUzyO1/Zm6rsxKRHA1vH0NEDG58HrT5icx/azi9MF1TWdtttWl0UIUsjEQBBh+SIkrpd21ZjEv7ptxWys1ncsg==", + "node_modules/@vitest/snapshot": { + "version": "4.1.8", + "resolved": "https://registry.npmjs.org/@vitest/snapshot/-/snapshot-4.1.8.tgz", + "integrity": "sha512-acfZboRmAIf05DEKcBQy33VXojFJjtUdLyo7oOmV9kebb2xdU01UknNiPuPZoJZQyO7DF0gZdTGTpeAzET9QPQ==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "@vitest/pretty-format": "4.1.8", + "@vitest/utils": "4.1.8", + "magic-string": "^0.30.21", + "pathe": "^2.0.3" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } }, - "node_modules/@types/d3-random": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/@types/d3-random/-/d3-random-3.0.3.tgz", - "integrity": "sha512-Imagg1vJ3y76Y2ea0871wpabqp613+8/r0mCLEBfdtqC7xMSfj9idOnmBYyMoULfHePJyxMAw3nWhJxzc+LFwQ==", + "node_modules/@vitest/spy": { + "version": "4.1.8", + "resolved": "https://registry.npmjs.org/@vitest/spy/-/spy-4.1.8.tgz", + "integrity": "sha512-6EevtBp6OZOPF7bmz36HrGMeP3txgVSrgebWxHOafDXGkhIzfXK14f8KF6MuFfgXXUeHxmpD3BQxkV00/3s5mA==", "dev": true, - "license": "MIT" + "license": "MIT", + "funding": { + "url": "https://opencollective.com/vitest" + } }, - "node_modules/@types/d3-scale": { - "version": "4.0.9", - "resolved": "https://registry.npmjs.org/@types/d3-scale/-/d3-scale-4.0.9.tgz", - "integrity": "sha512-dLmtwB8zkAeO/juAMfnV+sItKjlsw2lKdZVVy6LRr0cBmegxSABiLEpGVmSJJ8O08i4+sGR6qQtb6WtuwJdvVw==", + "node_modules/@vitest/utils": { + "version": "4.1.8", + "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-4.1.8.tgz", + "integrity": "sha512-uOJamYALNhfJ6iolExyQM40yIQwDqYnkKtQ5VCiSe17E33H0aQ/u+1GlRuz4LZBk6Mm3sg90G9hEbmEt37C1Zg==", "dev": true, "license": "MIT", "dependencies": { - "@types/d3-time": "*" + "@vitest/pretty-format": "4.1.8", + "convert-source-map": "^2.0.0", + "tinyrainbow": "^3.1.0" + }, + "funding": { + "url": "https://opencollective.com/vitest" } }, - "node_modules/@types/d3-scale-chromatic": { + "node_modules/@vitest/utils/node_modules/tinyrainbow": { "version": "3.1.0", - "resolved": "https://registry.npmjs.org/@types/d3-scale-chromatic/-/d3-scale-chromatic-3.1.0.tgz", - "integrity": "sha512-iWMJgwkK7yTRmWqRB5plb1kadXyQ5Sj8V/zYlFGMUBbIPKQScw+Dku9cAAMgJG+z5GYDoMjWGLVOvjghDEFnKQ==", + "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", + "integrity": "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==", "dev": true, - "license": "MIT" + "license": "MIT", + "engines": { + "node": ">=14.0.0" + } }, - "node_modules/@types/d3-selection": { - "version": "3.0.11", - "resolved": "https://registry.npmjs.org/@types/d3-selection/-/d3-selection-3.0.11.tgz", - "integrity": "sha512-bhAXu23DJWsrI45xafYpkQ4NtcKMwWnAC/vKrd2l+nxMFuvOT3XMYTIj2opv8vq8AO5Yh7Qac/nSeP/3zjTK0w==", + "node_modules/@webcontainer/env": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@webcontainer/env/-/env-1.1.1.tgz", + "integrity": "sha512-6aN99yL695Hi9SuIk1oC88l9o0gmxL1nGWWQ/kNy81HigJ0FoaoTXpytCj6ItzgyCEwA9kF1wixsTuv5cjsgng==", "dev": true, "license": "MIT" }, - "node_modules/@types/d3-shape": { - "version": "3.1.8", - "resolved": "https://registry.npmjs.org/@types/d3-shape/-/d3-shape-3.1.8.tgz", - "integrity": "sha512-lae0iWfcDeR7qt7rA88BNiqdvPS5pFVPpo5OfjElwNaT2yyekbM0C9vK+yqBqEmHr6lDkRnYNoTBYlAgJa7a4w==", + "node_modules/accepts": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/accepts/-/accepts-2.0.0.tgz", + "integrity": "sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng==", "dev": true, "license": "MIT", "dependencies": { - "@types/d3-path": "*" + "mime-types": "^3.0.0", + "negotiator": "^1.0.0" + }, + "engines": { + "node": ">= 0.6" } }, - "node_modules/@types/d3-time": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/@types/d3-time/-/d3-time-3.0.4.tgz", - "integrity": "sha512-yuzZug1nkAAaBlBBikKZTgzCeA+k1uy4ZFwWANOfKw5z5LRhV0gNA7gNkKm7HoK+HRN0wX3EkxGk0fpbWhmB7g==", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/d3-time-format": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/@types/d3-time-format/-/d3-time-format-4.0.3.tgz", - "integrity": "sha512-5xg9rC+wWL8kdDj153qZcsJ0FWiFt0J5RB6LYUNZjwSnesfblqrI/bJ1wBdJ8OQfncgbJG5+2F+qfqnqyzYxyg==", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/d3-timer": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/@types/d3-timer/-/d3-timer-3.0.2.tgz", - "integrity": "sha512-Ps3T8E8dZDam6fUyNiMkekK3XUsaUEik+idO9/YjPtfj2qruF8tFBXS7XhtE4iIXBLxhmLjP3SXpLhVf21I9Lw==", + "node_modules/acorn": { + "version": "8.16.0", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.16.0.tgz", + "integrity": "sha512-UVJyE9MttOsBQIDKw1skb9nAwQuR5wuGD3+82K6JgJlm/Y+KI92oNsMNGZCYdDsVtRHSak0pcV5Dno5+4jh9sw==", "dev": true, - "license": "MIT" + "license": "MIT", + "bin": { + "acorn": "bin/acorn" + }, + "engines": { + "node": ">=0.4.0" + } }, - "node_modules/@types/d3-transition": { - "version": "3.0.9", - "resolved": "https://registry.npmjs.org/@types/d3-transition/-/d3-transition-3.0.9.tgz", - "integrity": "sha512-uZS5shfxzO3rGlu0cC3bjmMFKsXv+SmZZcgp0KD22ts4uGXp5EVYGzu/0YdwZeKmddhcAccYtREJKkPfXkZuCg==", + "node_modules/acorn-jsx": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/acorn-jsx/-/acorn-jsx-5.3.2.tgz", + "integrity": "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==", "dev": true, "license": "MIT", - "dependencies": { - "@types/d3-selection": "*" + "peerDependencies": { + "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" } }, - "node_modules/@types/d3-zoom": { - "version": "3.0.8", - "resolved": "https://registry.npmjs.org/@types/d3-zoom/-/d3-zoom-3.0.8.tgz", - "integrity": "sha512-iqMC4/YlFCSlO8+2Ii1GGGliCAY4XdeG748w5vQUbevlbDu0zSjH/+jojorQVBK/se0j6DUFNPBGSqD3YWYnDw==", + "node_modules/ajv": { + "version": "8.20.0", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.20.0.tgz", + "integrity": "sha512-Thbli+OlOj+iMPYFBVBfJ3OmCAnaSyNn4M1vz9T6Gka5Jt9ba/HIR56joy65tY6kx/FCF5VXNB819Y7/GUrBGA==", "dev": true, "license": "MIT", "dependencies": { - "@types/d3-interpolate": "*", - "@types/d3-selection": "*" + "fast-deep-equal": "^3.1.3", + "fast-uri": "^3.0.1", + "json-schema-traverse": "^1.0.0", + "require-from-string": "^2.0.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" } }, - "node_modules/@types/debug": { - "version": "4.1.12", - "resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz", - "integrity": "sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ==", + "node_modules/ajv-formats": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/ajv-formats/-/ajv-formats-3.0.1.tgz", + "integrity": "sha512-8iUql50EUR+uUcdRQ3HDqa6EVyo3docL8g5WJ3FNcWmu62IbkGUue/pEyLBW8VGKKucTPgqeks4fIU1DA4yowQ==", "dev": true, "license": "MIT", "dependencies": { - "@types/ms": "*" + "ajv": "^8.0.0" + }, + "peerDependencies": { + "ajv": "^8.0.0" + }, + "peerDependenciesMeta": { + "ajv": { + "optional": true + } } }, - "node_modules/@types/deep-eql": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/@types/deep-eql/-/deep-eql-4.0.2.tgz", - "integrity": "sha512-c9h9dVVMigMPc4bwTvC5dxqtqJZwQPePsWjPlpSOnojbor6pGqdk541lfA7AqFQr5pB1BRdq0juY9db81BwyFw==", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/estree": { - "version": "1.0.8", - "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", - "integrity": "sha512-dWHzHa2WqEXI/O1E9OjrocMTKJl2mSrEolh1Iomrv6U+JuNwaHXsXx9bLu5gG7BUWFIN0skIQJQ/L1rIex4X6w==", - "dev": true, - "license": "MIT" - }, - "node_modules/@types/geojson": { - "version": "7946.0.16", - "resolved": "https://registry.npmjs.org/@types/geojson/-/geojson-7946.0.16.tgz", - "integrity": "sha512-6C8nqWur3j98U6+lXDfTUWIfgvZU+EumvpHKcYjujKH7woYyLj2sUmff0tRhrqM7BohUw7Pz3ZB1jj2gW9Fvmg==", + "node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", "dev": true, - "license": "MIT" + "license": "MIT", + "peer": true, + "engines": { + "node": ">=8" + } }, - "node_modules/@types/hast": { - "version": "3.0.4", - "resolved": "https://registry.npmjs.org/@types/hast/-/hast-3.0.4.tgz", - "integrity": "sha512-WPs+bbQw5aCj+x6laNGWLH3wviHtoCv/P3+otBhbOhJgG8qtpdAMlTCxLtsTWA7LH1Oh/bFCHsBn0TPS5m30EQ==", + "node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", "dev": true, "license": "MIT", "dependencies": { - "@types/unist": "*" + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" } }, - "node_modules/@types/json-schema": { - "version": "7.0.15", - "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz", - "integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==", + "node_modules/argparse": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", "dev": true, - "license": "MIT" + "license": "Python-2.0" }, - "node_modules/@types/katex": { - "version": "0.16.7", - "resolved": "https://registry.npmjs.org/@types/katex/-/katex-0.16.7.tgz", - "integrity": "sha512-HMwFiRujE5PjrgwHQ25+bsLJgowjGjm5Z8FVSf0N6PwgJrwxH0QxzHYDcKsTfV3wva0vzrpqMTJS2jXPr5BMEQ==", + "node_modules/aria-query": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/aria-query/-/aria-query-5.3.0.tgz", + "integrity": "sha512-b0P0sZPKtyu8HkeRAfCq0IfURZK+SuwMjY1UXGBU27wpAiTwQAIlq56IbIO+ytk/JjS1fMR14ee5WBBfKi5J6A==", "dev": true, - "license": "MIT" + "license": "Apache-2.0", + "dependencies": { + "dequal": "^2.0.3" + } }, - "node_modules/@types/mdast": { - "version": "4.0.4", - "resolved": "https://registry.npmjs.org/@types/mdast/-/mdast-4.0.4.tgz", - "integrity": "sha512-kGaNbPh1k7AFzgpud/gMdvIm5xuECykRR+JnWKQno9TAXVa6WIVCGTPvYGekIDL4uwCZQSYbUxNBSb1aUo79oA==", + "node_modules/array-buffer-byte-length": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/array-buffer-byte-length/-/array-buffer-byte-length-1.0.2.tgz", + "integrity": "sha512-LHE+8BuR7RYGDKvnrmcuSq3tDcKv9OFEXQt/HpbZhY7V6h0zlUXutnAD82GiFx9rdieCMjkvtcsPqBwgUl1Iiw==", "dev": true, "license": "MIT", "dependencies": { - "@types/unist": "*" + "call-bound": "^1.0.3", + "is-array-buffer": "^3.0.5" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/@types/mdx": { - "version": "2.0.13", - "resolved": "https://registry.npmjs.org/@types/mdx/-/mdx-2.0.13.tgz", - "integrity": "sha512-+OWZQfAYyio6YkJb3HLxDrvnx6SWWDbC0zVPfBRzUk0/nqoDyf6dNxQi3eArPe8rJ473nobTMQ/8Zk+LxJ+Yuw==", + "node_modules/arraybuffer.prototype.slice": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/arraybuffer.prototype.slice/-/arraybuffer.prototype.slice-1.0.4.tgz", + "integrity": "sha512-BNoCY6SXXPQ7gF2opIP4GBE+Xw7U+pHMYKuzjgCN3GwiaIR09UUeKfheyIry77QtrCBlC0KK0q5/TER/tYh3PQ==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "array-buffer-byte-length": "^1.0.1", + "call-bind": "^1.0.8", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.5", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "is-array-buffer": "^3.0.4" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } }, - "node_modules/@types/ms": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/@types/ms/-/ms-2.1.0.tgz", - "integrity": "sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==", + "node_modules/assertion-error": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-2.0.1.tgz", + "integrity": "sha512-Izi8RQcffqCeNVgFigKli1ssklIbpHnCYc6AknXGYoB6grJqyeby7jv12JUQgmTAnIDnbck1uxksT4dzN3PWBA==", "dev": true, - "license": "MIT" + "license": "MIT", + "engines": { + "node": ">=12" + } }, - "node_modules/@types/node": { - "version": "24.10.10", - "resolved": "https://registry.npmjs.org/@types/node/-/node-24.10.10.tgz", - "integrity": "sha512-+0/4J266CBGPUq/ELg7QUHhN25WYjE0wYTPSQJn1xeu8DOlIOPxXxrNGiLmfAWl7HMMgWFWXpt9IDjMWrF5Iow==", + "node_modules/ast-types": { + "version": "0.16.1", + "resolved": "https://registry.npmjs.org/ast-types/-/ast-types-0.16.1.tgz", + "integrity": "sha512-6t10qk83GOG8p0vKmaCr8eiilZwO171AvbROMtvvNiwrTly62t+7XkA8RdIIVbpMhCASAsxgAzdRSwh6nw/5Dg==", "dev": true, "license": "MIT", "dependencies": { - "undici-types": "~7.16.0" + "tslib": "^2.0.1" + }, + "engines": { + "node": ">=4" } }, - "node_modules/@types/react": { - "version": "19.2.16", - "resolved": "https://registry.npmjs.org/@types/react/-/react-19.2.16.tgz", - "integrity": "sha512-esJiCAnl0kfpNdE69f3So4WJUXy95dLZydX0KwK46riIHDzHM7O9Vtf9xCHW0PXIqvgqNrswl522kA/5yx+F4w==", + "node_modules/ast-v8-to-istanbul": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/ast-v8-to-istanbul/-/ast-v8-to-istanbul-1.0.3.tgz", + "integrity": "sha512-jCMQ6ZylLPudp0CDfBmQBZUsrh1/8psbmu9ibeVWKuHWD0YrH9YABwlKu5kVEFoT0GCQQW9Z/SxfuEbbkGQCRg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { - "csstype": "^3.2.2" + "@jridgewell/trace-mapping": "^0.3.31", + "estree-walker": "^3.0.3", + "js-tokens": "^10.0.0" } }, - "node_modules/@types/trusted-types": { - "version": "2.0.7", - "resolved": "https://registry.npmjs.org/@types/trusted-types/-/trusted-types-2.0.7.tgz", - "integrity": "sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==", + "node_modules/ast-v8-to-istanbul/node_modules/js-tokens": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-10.0.0.tgz", + "integrity": "sha512-lM/UBzQmfJRo9ABXbPWemivdCW8V2G8FHaHdypQaIy523snUjog0W71ayWXTjiR+ixeMyVHN2XcpnTd/liPg/Q==", + "dev": true, + "license": "MIT" + }, + "node_modules/async": { + "version": "3.2.6", + "resolved": "https://registry.npmjs.org/async/-/async-3.2.6.tgz", + "integrity": "sha512-htCUDlxyyCLMgaM3xXg0C0LW2xqfuQ6p05pCEIsXuyQ+a1koYKTuBMzRNwmybfLgvJDMd0r1LTn4+E0Ti6C2AA==", "dev": true, "license": "MIT" }, - "node_modules/@types/unist": { - "version": "2.0.11", - "resolved": "https://registry.npmjs.org/@types/unist/-/unist-2.0.11.tgz", - "integrity": "sha512-CmBKiL6NNo/OqgmMn95Fk9Whlp2mtvIv+KNpQKN2F4SjvrEesubTRWGYSg+BnWZOnlCaSTU1sMpsBOzgbYhnsA==", + "node_modules/async-function": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/async-function/-/async-function-1.0.0.tgz", + "integrity": "sha512-hsU18Ae8CDTR6Kgu9DYf0EbCr/a5iGL0rytQDobUcdpYOKokk8LEjVphnXkDkgpi0wYVsqrXuP0bZxJaTqdgoA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/at-least-node": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/at-least-node/-/at-least-node-1.0.0.tgz", + "integrity": "sha512-+q/t7Ekv1EDY2l6Gda6LLiX14rU9TV20Wa3ofeQmwPFZbOMo9DXrLbOjFaaclkXKWidIaopwAObQDqwWtGUjqg==", "dev": true, - "license": "MIT" + "license": "ISC", + "engines": { + "node": ">= 4.0.0" + } }, - "node_modules/@typescript-eslint/eslint-plugin": { - "version": "8.56.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-8.56.0.tgz", - "integrity": "sha512-lRyPDLzNCuae71A3t9NEINBiTn7swyOhvUj3MyUOxb8x6g6vPEFoOU+ZRmGMusNC3X3YMhqMIX7i8ShqhT74Pw==", + "node_modules/available-typed-arrays": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/available-typed-arrays/-/available-typed-arrays-1.0.7.tgz", + "integrity": "sha512-wvUjBtSGN7+7SjNpq/9M2Tg350UZD3q62IFZLbRAR1bSMlCo1ZaeW+BJ+D090e4hIIZLBcTDWe4Mh4jvUDajzQ==", "dev": true, "license": "MIT", "dependencies": { - "@eslint-community/regexpp": "^4.12.2", - "@typescript-eslint/scope-manager": "8.56.0", - "@typescript-eslint/type-utils": "8.56.0", - "@typescript-eslint/utils": "8.56.0", - "@typescript-eslint/visitor-keys": "8.56.0", - "ignore": "^7.0.5", - "natural-compare": "^1.4.0", - "ts-api-utils": "^2.4.0" + "possible-typed-array-names": "^1.0.0" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": ">= 0.4" }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" - }, - "peerDependencies": { - "@typescript-eslint/parser": "^8.56.0", - "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", - "typescript": ">=4.8.4 <6.0.0" + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/@typescript-eslint/eslint-plugin/node_modules/ignore": { - "version": "7.0.5", - "resolved": "https://registry.npmjs.org/ignore/-/ignore-7.0.5.tgz", - "integrity": "sha512-Hs59xBNfUIunMFgWAbGX5cq6893IbWg4KnrjbYwX3tx0ztorVgTDA6B2sxf8ejHJ4wz8BqGUMYlnzNBer5NvGg==", + "node_modules/axe-core": { + "version": "4.12.0", + "resolved": "https://registry.npmjs.org/axe-core/-/axe-core-4.12.0.tgz", + "integrity": "sha512-FTavr/7Ba0IptwGOPxnQvdyW2tAsdLBMTBXz7rKH6xJ2skpyxpBxyHkDdBs4lf69yRqYpkqCdfhnwS8YULGOmg==", "dev": true, - "license": "MIT", + "license": "MPL-2.0", "engines": { - "node": ">= 4" + "node": ">=4" } }, - "node_modules/@typescript-eslint/parser": { - "version": "8.56.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/parser/-/parser-8.56.0.tgz", - "integrity": "sha512-IgSWvLobTDOjnaxAfDTIHaECbkNlAlKv2j5SjpB2v7QHKv1FIfjwMy8FsDbVfDX/KjmCmYICcw7uGaXLhtsLNg==", + "node_modules/axobject-query": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/axobject-query/-/axobject-query-4.1.0.tgz", + "integrity": "sha512-qIj0G9wZbMGNLjLmg1PT6v2mE9AH2zlnADJD/2tC6E00hgmhUOfEB6greHPAfLRSufHqROIUTkw6E+M3lH0PTQ==", "dev": true, - "license": "MIT", - "dependencies": { - "@typescript-eslint/scope-manager": "8.56.0", - "@typescript-eslint/types": "8.56.0", - "@typescript-eslint/typescript-estree": "8.56.0", - "@typescript-eslint/visitor-keys": "8.56.0", - "debug": "^4.4.3" - }, + "license": "Apache-2.0", "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" - }, - "peerDependencies": { - "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", - "typescript": ">=4.8.4 <6.0.0" + "node": ">= 0.4" } }, - "node_modules/@typescript-eslint/project-service": { - "version": "8.56.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/project-service/-/project-service-8.56.0.tgz", - "integrity": "sha512-M3rnyL1vIQOMeWxTWIW096/TtVP+8W3p/XnaFflhmcFp+U4zlxUxWj4XwNs6HbDeTtN4yun0GNTTDBw/SvufKg==", + "node_modules/babel-plugin-polyfill-corejs2": { + "version": "0.4.17", + "resolved": "https://registry.npmjs.org/babel-plugin-polyfill-corejs2/-/babel-plugin-polyfill-corejs2-0.4.17.tgz", + "integrity": "sha512-aTyf30K/rqAsNwN76zYrdtx8obu0E4KoUME29B1xj+B3WxgvWkp943vYQ+z8Mv3lw9xHXMHpvSPOBxzAkIa94w==", "dev": true, "license": "MIT", "dependencies": { - "@typescript-eslint/tsconfig-utils": "^8.56.0", - "@typescript-eslint/types": "^8.56.0", - "debug": "^4.4.3" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" + "@babel/compat-data": "^7.28.6", + "@babel/helper-define-polyfill-provider": "^0.6.8", + "semver": "^6.3.1" }, "peerDependencies": { - "typescript": ">=4.8.4 <6.0.0" + "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" } }, - "node_modules/@typescript-eslint/scope-manager": { - "version": "8.56.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/scope-manager/-/scope-manager-8.56.0.tgz", - "integrity": "sha512-7UiO/XwMHquH+ZzfVCfUNkIXlp/yQjjnlYUyYz7pfvlK3/EyyN6BK+emDmGNyQLBtLGaYrTAI6KOw8tFucWL2w==", + "node_modules/babel-plugin-polyfill-corejs2/node_modules/semver": { + "version": "6.3.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz", + "integrity": "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA==", "dev": true, - "license": "MIT", - "dependencies": { - "@typescript-eslint/types": "8.56.0", - "@typescript-eslint/visitor-keys": "8.56.0" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" + "license": "ISC", + "bin": { + "semver": "bin/semver.js" } }, - "node_modules/@typescript-eslint/tsconfig-utils": { - "version": "8.56.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/tsconfig-utils/-/tsconfig-utils-8.56.0.tgz", - "integrity": "sha512-bSJoIIt4o3lKXD3xmDh9chZcjCz5Lk8xS7Rxn+6l5/pKrDpkCwtQNQQwZ2qRPk7TkUYhrq3WPIHXOXlbXP0itg==", + "node_modules/babel-plugin-polyfill-corejs3": { + "version": "0.14.2", + "resolved": "https://registry.npmjs.org/babel-plugin-polyfill-corejs3/-/babel-plugin-polyfill-corejs3-0.14.2.tgz", + "integrity": "sha512-coWpDLJ410R781Npmn/SIBZEsAetR4xVi0SxLMXPaMO4lSf1MwnkGYMtkFxew0Dn8B3/CpbpYxN0JCgg8mn67g==", "dev": true, "license": "MIT", - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" + "dependencies": { + "@babel/helper-define-polyfill-provider": "^0.6.8", + "core-js-compat": "^3.48.0" }, "peerDependencies": { - "typescript": ">=4.8.4 <6.0.0" + "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" } }, - "node_modules/@typescript-eslint/type-utils": { - "version": "8.56.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/type-utils/-/type-utils-8.56.0.tgz", - "integrity": "sha512-qX2L3HWOU2nuDs6GzglBeuFXviDODreS58tLY/BALPC7iu3Fa+J7EOTwnX9PdNBxUI7Uh0ntP0YWGnxCkXzmfA==", + "node_modules/babel-plugin-polyfill-regenerator": { + "version": "0.6.8", + "resolved": "https://registry.npmjs.org/babel-plugin-polyfill-regenerator/-/babel-plugin-polyfill-regenerator-0.6.8.tgz", + "integrity": "sha512-M762rNHfSF1EV3SLtnCJXFoQbbIIz0OyRwnCmV0KPC7qosSfCO0QLTSuJX3ayAebubhE6oYBAYPrBA5ljowaZg==", "dev": true, "license": "MIT", "dependencies": { - "@typescript-eslint/types": "8.56.0", - "@typescript-eslint/typescript-estree": "8.56.0", - "@typescript-eslint/utils": "8.56.0", - "debug": "^4.4.3", - "ts-api-utils": "^2.4.0" - }, - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" + "@babel/helper-define-polyfill-provider": "^0.6.8" }, "peerDependencies": { - "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", - "typescript": ">=4.8.4 <6.0.0" + "@babel/core": "^7.4.0 || ^8.0.0-0 <8.0.0" } }, - "node_modules/@typescript-eslint/types": { - "version": "8.56.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/types/-/types-8.56.0.tgz", - "integrity": "sha512-DBsLPs3GsWhX5HylbP9HNG15U0bnwut55Lx12bHB9MpXxQ+R5GC8MwQe+N1UFXxAeQDvEsEDY6ZYwX03K7Z6HQ==", + "node_modules/bail": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/bail/-/bail-2.0.2.tgz", + "integrity": "sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw==", "dev": true, "license": "MIT", - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" + "type": "github", + "url": "https://github.com/sponsors/wooorm" } }, - "node_modules/@typescript-eslint/typescript-estree": { - "version": "8.56.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/typescript-estree/-/typescript-estree-8.56.0.tgz", - "integrity": "sha512-ex1nTUMWrseMltXUHmR2GAQ4d+WjkZCT4f+4bVsps8QEdh0vlBsaCokKTPlnqBFqqGaxilDNJG7b8dolW2m43Q==", + "node_modules/balanced-match": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", + "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", "dev": true, - "license": "MIT", - "dependencies": { - "@typescript-eslint/project-service": "8.56.0", - "@typescript-eslint/tsconfig-utils": "8.56.0", - "@typescript-eslint/types": "8.56.0", - "@typescript-eslint/visitor-keys": "8.56.0", - "debug": "^4.4.3", - "minimatch": "^9.0.5", - "semver": "^7.7.3", - "tinyglobby": "^0.2.15", - "ts-api-utils": "^2.4.0" + "license": "MIT" + }, + "node_modules/baseline-browser-mapping": { + "version": "2.10.33", + "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.10.33.tgz", + "integrity": "sha512-bA6+tcSLpz2tIEdDXZPpPTIuxBcC4+w6SieaYyfigIa4h8GlFxbA17v22Vx3JUtuZQj9SgOsnbK+aTBzyDyEuw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "baseline-browser-mapping": "dist/cli.cjs" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" - }, - "peerDependencies": { - "typescript": ">=4.8.4 <6.0.0" + "node": ">=6.0.0" } }, - "node_modules/@typescript-eslint/typescript-estree/node_modules/brace-expansion": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.3.tgz", - "integrity": "sha512-MCV/fYJEbqx68aE58kv2cA/kiky1G8vux3OR6/jbS+jIMe/6fJWa0DTzJU7dqijOWYwHi1t29FlfYI9uytqlpA==", + "node_modules/basic-auth": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/basic-auth/-/basic-auth-2.0.1.tgz", + "integrity": "sha512-NF+epuEdnUYVlGuhaxbbq+dvJttwLnGY+YixlXlME5KpQ5W3CnXA5cVTneY3SPbPDRkcjMbifrwmFYcClgOZeg==", "dev": true, "license": "MIT", "dependencies": { - "balanced-match": "^1.0.0" - } - }, - "node_modules/@typescript-eslint/typescript-estree/node_modules/minimatch": { - "version": "9.0.9", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.9.tgz", - "integrity": "sha512-OBwBN9AL4dqmETlpS2zasx+vTeWclWzkblfZk7KTA5j3jeOONz/tRCnZomUyvNg83wL5Zv9Ss6HMJXAgL8R2Yg==", - "dev": true, - "license": "ISC", - "dependencies": { - "brace-expansion": "^2.0.2" + "safe-buffer": "5.1.2" }, "engines": { - "node": ">=16 || 14 >=14.17" - }, - "funding": { - "url": "https://github.com/sponsors/isaacs" + "node": ">= 0.8" } }, - "node_modules/@typescript-eslint/utils": { - "version": "8.56.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/utils/-/utils-8.56.0.tgz", - "integrity": "sha512-RZ3Qsmi2nFGsS+n+kjLAYDPVlrzf7UhTffrDIKr+h2yzAlYP/y5ZulU0yeDEPItos2Ph46JAL5P/On3pe7kDIQ==", + "node_modules/bits-ui": { + "version": "2.18.1", + "resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.18.1.tgz", + "integrity": "sha512-KkemzKFH4T3gt3H+P86JcnAWExjByv/6vlwjm/BoCwTPHu03yiCdxbghdJLvFReQTe0acCAiRcKfmixxD6XvlA==", "dev": true, "license": "MIT", "dependencies": { - "@eslint-community/eslint-utils": "^4.9.1", - "@typescript-eslint/scope-manager": "8.56.0", - "@typescript-eslint/types": "8.56.0", - "@typescript-eslint/typescript-estree": "8.56.0" + "@floating-ui/core": "^1.7.1", + "@floating-ui/dom": "^1.7.1", + "esm-env": "^1.1.2", + "runed": "^0.35.1", + "svelte-toolbelt": "^0.10.6", + "tabbable": "^6.2.0" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": ">=20" }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" + "url": "https://github.com/sponsors/huntabyte" }, "peerDependencies": { - "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", - "typescript": ">=4.8.4 <6.0.0" + "@internationalized/date": "^3.8.1", + "svelte": "^5.33.0" } }, - "node_modules/@typescript-eslint/visitor-keys": { - "version": "8.56.0", - "resolved": "https://registry.npmjs.org/@typescript-eslint/visitor-keys/-/visitor-keys-8.56.0.tgz", - "integrity": "sha512-q+SL+b+05Ud6LbEE35qe4A99P+htKTKVbyiNEe45eCbJFyh/HVK9QXwlrbz+Q4L8SOW4roxSVwXYj4DMBT7Ieg==", + "node_modules/body-parser": { + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.2.tgz", + "integrity": "sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==", "dev": true, "license": "MIT", "dependencies": { - "@typescript-eslint/types": "8.56.0", - "eslint-visitor-keys": "^5.0.0" + "bytes": "^3.1.2", + "content-type": "^1.0.5", + "debug": "^4.4.3", + "http-errors": "^2.0.0", + "iconv-lite": "^0.7.0", + "on-finished": "^2.4.1", + "qs": "^6.14.1", + "raw-body": "^3.0.1", + "type-is": "^2.0.1" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": ">=18" }, "funding": { "type": "opencollective", - "url": "https://opencollective.com/typescript-eslint" - } - }, - "node_modules/@typescript-eslint/visitor-keys/node_modules/eslint-visitor-keys": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-5.0.0.tgz", - "integrity": "sha512-A0XeIi7CXU7nPlfHS9loMYEKxUaONu/hTEzHTGba9Huu94Cq1hPivf+DE5erJozZOky0LfvXAyrV/tcswpLI0Q==", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": "^20.19.0 || ^22.13.0 || >=24" - }, - "funding": { - "url": "https://opencollective.com/eslint" - } - }, - "node_modules/@ungap/structured-clone": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/@ungap/structured-clone/-/structured-clone-1.3.0.tgz", - "integrity": "sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g==", - "dev": true, - "license": "ISC" - }, - "node_modules/@upsetjs/venn.js": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/@upsetjs/venn.js/-/venn.js-2.0.0.tgz", - "integrity": "sha512-WbBhLrooyePuQ1VZxrJjtLvTc4NVfpOyKx0sKqioq9bX1C1m7Jgykkn8gLrtwumBioXIqam8DLxp88Adbue6Hw==", - "dev": true, - "license": "MIT", - "optionalDependencies": { - "d3-selection": "^3.0.0", - "d3-transition": "^3.0.1" + "url": "https://opencollective.com/express" } }, - "node_modules/@vitest/browser": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/browser/-/browser-4.1.8.tgz", - "integrity": "sha512-u21VzX07HzlJYpFgkxmjEXar/tG2UqWGgyGG/46SrrPc7rSdCTPw5vuowopO9CIqF8UCUQzDFdbVnNpw6N0BfQ==", + "node_modules/brace-expansion": { + "version": "1.1.15", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.15.tgz", + "integrity": "sha512-EwOCDEex4quD37XhqM3omwtMoJjr//isUZz1JopUNWms+4Z2ViyM/k1YIRePpoVNnQhENnxtFjLaxNHrT7xIUg==", "dev": true, "license": "MIT", "dependencies": { - "@blazediff/core": "1.9.1", - "@vitest/mocker": "4.1.8", - "@vitest/utils": "4.1.8", - "magic-string": "^0.30.21", - "pngjs": "^7.0.0", - "sirv": "^3.0.2", - "tinyrainbow": "^3.1.0", - "ws": "^8.19.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" - }, - "peerDependencies": { - "vitest": "4.1.8" + "balanced-match": "^1.0.0", + "concat-map": "0.0.1" } }, - "node_modules/@vitest/browser-playwright": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/browser-playwright/-/browser-playwright-4.1.8.tgz", - "integrity": "sha512-SR7FqgegaexEg73xvf3ArtygXegagMdXnL0EZMpxrWvvhQxvicD/E8p0ib0J91riPRtQUViyh67Xjw3NqvyhVg==", + "node_modules/browserslist": { + "version": "4.28.2", + "resolved": "https://registry.npmjs.org/browserslist/-/browserslist-4.28.2.tgz", + "integrity": "sha512-48xSriZYYg+8qXna9kwqjIVzuQxi+KYWp2+5nCYnYKPTr0LvD89Jqk2Or5ogxz0NUMfIjhh2lIUX/LyX9B4oIg==", "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], "license": "MIT", "dependencies": { - "@vitest/browser": "4.1.8", - "@vitest/mocker": "4.1.8", - "tinyrainbow": "^3.1.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" + "baseline-browser-mapping": "^2.10.12", + "caniuse-lite": "^1.0.30001782", + "electron-to-chromium": "^1.5.328", + "node-releases": "^2.0.36", + "update-browserslist-db": "^1.2.3" }, - "peerDependencies": { - "playwright": "*", - "vitest": "4.1.8" + "bin": { + "browserslist": "cli.js" }, - "peerDependenciesMeta": { - "playwright": { - "optional": false - } + "engines": { + "node": "^6 || ^7 || ^8 || ^9 || ^10 || ^11 || ^12 || >=13.7" } }, - "node_modules/@vitest/browser-playwright/node_modules/tinyrainbow": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", - "integrity": "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==", + "node_modules/buffer-from": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz", + "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==", "dev": true, - "license": "MIT", - "engines": { - "node": ">=14.0.0" - } + "license": "MIT" }, - "node_modules/@vitest/browser/node_modules/@vitest/pretty-format": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-4.1.8.tgz", - "integrity": "sha512-9GasEBxpZ1VYIpqHf/0+YGg121uSNwCKOJqIrTwWP/TB7DmFCiaBpNl3aPZzoLWfWkuqhbH8vJIVobZkvdo2cA==", + "node_modules/bundle-name": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/bundle-name/-/bundle-name-4.1.0.tgz", + "integrity": "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==", "dev": true, "license": "MIT", "dependencies": { - "tinyrainbow": "^3.1.0" + "run-applescript": "^7.0.0" + }, + "engines": { + "node": ">=18" }, "funding": { - "url": "https://opencollective.com/vitest" + "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/@vitest/browser/node_modules/@vitest/utils": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-4.1.8.tgz", - "integrity": "sha512-uOJamYALNhfJ6iolExyQM40yIQwDqYnkKtQ5VCiSe17E33H0aQ/u+1GlRuz4LZBk6Mm3sg90G9hEbmEt37C1Zg==", + "node_modules/bytes": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", + "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==", "dev": true, "license": "MIT", - "dependencies": { - "@vitest/pretty-format": "4.1.8", - "convert-source-map": "^2.0.0", - "tinyrainbow": "^3.1.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" + "engines": { + "node": ">= 0.8" } }, - "node_modules/@vitest/browser/node_modules/tinyrainbow": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", - "integrity": "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==", + "node_modules/cac": { + "version": "6.7.14", + "resolved": "https://registry.npmjs.org/cac/-/cac-6.7.14.tgz", + "integrity": "sha512-b6Ilus+c3RrdDk+JhLKUAQfzzgLEPy6wcXqS7f/xe1EETvsDP6GORG7SFuOs6cID5YkqchW/LXZbX5bc8j7ZcQ==", "dev": true, "license": "MIT", "engines": { - "node": ">=14.0.0" + "node": ">=8" } }, - "node_modules/@vitest/coverage-v8": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/coverage-v8/-/coverage-v8-4.1.8.tgz", - "integrity": "sha512-lt3kovsyHwYe00wq4D1ti0Z974fWj4NLp6siqiyEufUpyFwK9Yhi7rBhac9JL5aA0zoMrJqc4vYPZRUnI7l7nw==", + "node_modules/call-bind": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.9.tgz", + "integrity": "sha512-a/hy+pNsFUTR+Iz8TCJvXudKVLAnz/DyeSUo10I5yvFDQJBFU2s9uqQpoSrJlroHUKoKqzg+epxyP9lqFdzfBQ==", "dev": true, "license": "MIT", "dependencies": { - "@bcoe/v8-coverage": "^1.0.2", - "@vitest/utils": "4.1.8", - "ast-v8-to-istanbul": "^1.0.0", - "istanbul-lib-coverage": "^3.2.2", - "istanbul-lib-report": "^3.0.1", - "istanbul-reports": "^3.2.0", - "magicast": "^0.5.2", - "obug": "^2.1.1", - "std-env": "^4.0.0-rc.1", - "tinyrainbow": "^3.1.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" + "call-bind-apply-helpers": "^1.0.2", + "es-define-property": "^1.0.1", + "get-intrinsic": "^1.3.0", + "set-function-length": "^1.2.2" }, - "peerDependencies": { - "@vitest/browser": "4.1.8", - "vitest": "4.1.8" + "engines": { + "node": ">= 0.4" }, - "peerDependenciesMeta": { - "@vitest/browser": { - "optional": true - } + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/@vitest/coverage-v8/node_modules/@vitest/pretty-format": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-4.1.8.tgz", - "integrity": "sha512-9GasEBxpZ1VYIpqHf/0+YGg121uSNwCKOJqIrTwWP/TB7DmFCiaBpNl3aPZzoLWfWkuqhbH8vJIVobZkvdo2cA==", + "node_modules/call-bind-apply-helpers": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", + "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", "dev": true, "license": "MIT", "dependencies": { - "tinyrainbow": "^3.1.0" + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" }, - "funding": { - "url": "https://opencollective.com/vitest" + "engines": { + "node": ">= 0.4" } }, - "node_modules/@vitest/coverage-v8/node_modules/@vitest/utils": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-4.1.8.tgz", - "integrity": "sha512-uOJamYALNhfJ6iolExyQM40yIQwDqYnkKtQ5VCiSe17E33H0aQ/u+1GlRuz4LZBk6Mm3sg90G9hEbmEt37C1Zg==", + "node_modules/call-bound": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/call-bound/-/call-bound-1.0.4.tgz", + "integrity": "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg==", "dev": true, "license": "MIT", "dependencies": { - "@vitest/pretty-format": "4.1.8", - "convert-source-map": "^2.0.0", - "tinyrainbow": "^3.1.0" + "call-bind-apply-helpers": "^1.0.2", + "get-intrinsic": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" }, "funding": { - "url": "https://opencollective.com/vitest" + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/@vitest/coverage-v8/node_modules/tinyrainbow": { + "node_modules/callsites": { "version": "3.1.0", - "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", - "integrity": "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==", + "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", + "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==", "dev": true, "license": "MIT", "engines": { - "node": ">=14.0.0" + "node": ">=6" } }, - "node_modules/@vitest/expect": { - "version": "3.2.4", - "resolved": "https://registry.npmjs.org/@vitest/expect/-/expect-3.2.4.tgz", - "integrity": "sha512-Io0yyORnB6sikFlt8QW5K7slY4OjqNX9jmJQ02QDda8lyM6B5oNgVWoSoKPac8/kgnCUzuHQKrSLtu/uOqqrig==", + "node_modules/caniuse-lite": { + "version": "1.0.30001793", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001793.tgz", + "integrity": "sha512-iwSsYWaCOoh26cV8NwNRViHlrfUvYsHDfRVcbtmw0Kg6PJIZZXwMkj1442FYLBGkeUf1juAsU3DTfxW579mrPA==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/caniuse-lite" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "CC-BY-4.0" + }, + "node_modules/ccount": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/ccount/-/ccount-2.0.1.tgz", + "integrity": "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg==", "dev": true, "license": "MIT", - "dependencies": { - "@types/chai": "^5.2.2", - "@vitest/spy": "3.2.4", - "@vitest/utils": "3.2.4", - "chai": "^5.2.0", - "tinyrainbow": "^2.0.0" - }, "funding": { - "url": "https://opencollective.com/vitest" + "type": "github", + "url": "https://github.com/sponsors/wooorm" } }, - "node_modules/@vitest/mocker": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/mocker/-/mocker-4.1.8.tgz", - "integrity": "sha512-LEiN/xe4OSIbKe9HQIp5OC24agGD9J5CnmMgsLohVVoOPWL9a2sBoR6VBx43jQZb7Kr1l4RCuyCJzcAa0+dojw==", + "node_modules/chai": { + "version": "5.3.3", + "resolved": "https://registry.npmjs.org/chai/-/chai-5.3.3.tgz", + "integrity": "sha512-4zNhdJD/iOjSH0A05ea+Ke6MU5mmpQcbQsSOkgdaUMJ9zTlDTD/GYlwohmIE2u0gaxHYiVHEn1Fw9mZ/ktJWgw==", "dev": true, "license": "MIT", "dependencies": { - "@vitest/spy": "4.1.8", - "estree-walker": "^3.0.3", - "magic-string": "^0.30.21" + "assertion-error": "^2.0.1", + "check-error": "^2.1.1", + "deep-eql": "^5.0.1", + "loupe": "^3.1.0", + "pathval": "^2.0.0" }, - "funding": { - "url": "https://opencollective.com/vitest" + "engines": { + "node": ">=18" + } + }, + "node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "license": "MIT", + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" }, - "peerDependencies": { - "msw": "^2.4.9", - "vite": "^6.0.0 || ^7.0.0 || ^8.0.0" + "engines": { + "node": ">=10" }, - "peerDependenciesMeta": { - "msw": { - "optional": true - }, - "vite": { - "optional": true - } + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" } }, - "node_modules/@vitest/mocker/node_modules/@vitest/spy": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/spy/-/spy-4.1.8.tgz", - "integrity": "sha512-6EevtBp6OZOPF7bmz36HrGMeP3txgVSrgebWxHOafDXGkhIzfXK14f8KF6MuFfgXXUeHxmpD3BQxkV00/3s5mA==", + "node_modules/character-entities": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/character-entities/-/character-entities-2.0.2.tgz", + "integrity": "sha512-shx7oQ0Awen/BRIdkjkvz54PnEEI/EjwXDSIZp86/KKdbafHh1Df/RYGBhn4hbe2+uKC9FnT5UCEdyPz3ai9hQ==", "dev": true, "license": "MIT", "funding": { - "url": "https://opencollective.com/vitest" + "type": "github", + "url": "https://github.com/sponsors/wooorm" } }, - "node_modules/@vitest/pretty-format": { - "version": "3.2.4", - "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-3.2.4.tgz", - "integrity": "sha512-IVNZik8IVRJRTr9fxlitMKeJeXFFFN0JaB9PHPGQ8NKQbGpfjlTx9zO4RefN8gp7eqjNy8nyK3NZmBzOPeIxtA==", + "node_modules/character-entities-html4": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/character-entities-html4/-/character-entities-html4-2.1.0.tgz", + "integrity": "sha512-1v7fgQRj6hnSwFpq1Eu0ynr/CDEw0rXo2B61qXrLNdHZmPKgb7fqS1a2JwF0rISo9q77jDI8VMEHoApn8qDoZA==", "dev": true, "license": "MIT", - "dependencies": { - "tinyrainbow": "^2.0.0" - }, "funding": { - "url": "https://opencollective.com/vitest" + "type": "github", + "url": "https://github.com/sponsors/wooorm" } }, - "node_modules/@vitest/runner": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/runner/-/runner-4.1.8.tgz", - "integrity": "sha512-EmVxeBAfMJvycdjd6Hm+RbFBbA9fKvo0Kx37hNpBYoYeavH3RNsBXWDooR1mgD52dCrxIIuP7UotpfiwOikvcg==", + "node_modules/character-entities-legacy": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/character-entities-legacy/-/character-entities-legacy-3.0.0.tgz", + "integrity": "sha512-RpPp0asT/6ufRm//AJVwpViZbGM/MkjQFxJccQRHmISF/22NBtsHqAWmL+/pmkPWoIUJdWyeVleTl1wydHATVQ==", "dev": true, "license": "MIT", - "dependencies": { - "@vitest/utils": "4.1.8", - "pathe": "^2.0.3" - }, "funding": { - "url": "https://opencollective.com/vitest" + "type": "github", + "url": "https://github.com/sponsors/wooorm" } }, - "node_modules/@vitest/runner/node_modules/@vitest/pretty-format": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-4.1.8.tgz", - "integrity": "sha512-9GasEBxpZ1VYIpqHf/0+YGg121uSNwCKOJqIrTwWP/TB7DmFCiaBpNl3aPZzoLWfWkuqhbH8vJIVobZkvdo2cA==", + "node_modules/check-error": { + "version": "2.1.3", + "resolved": "https://registry.npmjs.org/check-error/-/check-error-2.1.3.tgz", + "integrity": "sha512-PAJdDJusoxnwm1VwW07VWwUN1sl7smmC3OKggvndJFadxxDRyFJBX/ggnu/KE4kQAB7a3Dp8f/YXC1FlUprWmA==", "dev": true, "license": "MIT", - "dependencies": { - "tinyrainbow": "^3.1.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" + "engines": { + "node": ">= 16" } }, - "node_modules/@vitest/runner/node_modules/@vitest/utils": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-4.1.8.tgz", - "integrity": "sha512-uOJamYALNhfJ6iolExyQM40yIQwDqYnkKtQ5VCiSe17E33H0aQ/u+1GlRuz4LZBk6Mm3sg90G9hEbmEt37C1Zg==", + "node_modules/chokidar": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-5.0.0.tgz", + "integrity": "sha512-TQMmc3w+5AxjpL8iIiwebF73dRDF4fBIieAqGn9RGCWaEVwQ6Fb2cGe31Yns0RRIzii5goJ1Y7xbMwo1TxMplw==", "dev": true, "license": "MIT", - "dependencies": { - "@vitest/pretty-format": "4.1.8", - "convert-source-map": "^2.0.0", - "tinyrainbow": "^3.1.0" + "dependencies": { + "readdirp": "^5.0.0" + }, + "engines": { + "node": ">= 20.19.0" }, "funding": { - "url": "https://opencollective.com/vitest" + "url": "https://paulmillr.com/funding/" } }, - "node_modules/@vitest/runner/node_modules/tinyrainbow": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", - "integrity": "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==", + "node_modules/chownr": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/chownr/-/chownr-3.0.0.tgz", + "integrity": "sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==", "dev": true, - "license": "MIT", + "license": "BlueOak-1.0.0", "engines": { - "node": ">=14.0.0" + "node": ">=18" } }, - "node_modules/@vitest/snapshot": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/snapshot/-/snapshot-4.1.8.tgz", - "integrity": "sha512-acfZboRmAIf05DEKcBQy33VXojFJjtUdLyo7oOmV9kebb2xdU01UknNiPuPZoJZQyO7DF0gZdTGTpeAzET9QPQ==", + "node_modules/chromatic": { + "version": "13.3.5", + "resolved": "https://registry.npmjs.org/chromatic/-/chromatic-13.3.5.tgz", + "integrity": "sha512-MzPhxpl838qJUo0A55osCF2ifwPbjcIPeElr1d4SHcjnHoIcg7l1syJDrAYK/a+PcCBrOGi06jPNpQAln5hWgw==", "dev": true, "license": "MIT", - "dependencies": { - "@vitest/pretty-format": "4.1.8", - "@vitest/utils": "4.1.8", - "magic-string": "^0.30.21", - "pathe": "^2.0.3" + "bin": { + "chroma": "dist/bin.js", + "chromatic": "dist/bin.js", + "chromatic-cli": "dist/bin.js" }, - "funding": { - "url": "https://opencollective.com/vitest" + "peerDependencies": { + "@chromatic-com/cypress": "^0.*.* || ^1.0.0", + "@chromatic-com/playwright": "^0.*.* || ^1.0.0" + }, + "peerDependenciesMeta": { + "@chromatic-com/cypress": { + "optional": true + }, + "@chromatic-com/playwright": { + "optional": true + } } }, - "node_modules/@vitest/snapshot/node_modules/@vitest/pretty-format": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-4.1.8.tgz", - "integrity": "sha512-9GasEBxpZ1VYIpqHf/0+YGg121uSNwCKOJqIrTwWP/TB7DmFCiaBpNl3aPZzoLWfWkuqhbH8vJIVobZkvdo2cA==", + "node_modules/clsx": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/clsx/-/clsx-2.1.1.tgz", + "integrity": "sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==", "dev": true, "license": "MIT", - "dependencies": { - "tinyrainbow": "^3.1.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" + "engines": { + "node": ">=6" } }, - "node_modules/@vitest/snapshot/node_modules/@vitest/utils": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-4.1.8.tgz", - "integrity": "sha512-uOJamYALNhfJ6iolExyQM40yIQwDqYnkKtQ5VCiSe17E33H0aQ/u+1GlRuz4LZBk6Mm3sg90G9hEbmEt37C1Zg==", + "node_modules/color": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/color/-/color-4.2.3.tgz", + "integrity": "sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A==", "dev": true, "license": "MIT", "dependencies": { - "@vitest/pretty-format": "4.1.8", - "convert-source-map": "^2.0.0", - "tinyrainbow": "^3.1.0" + "color-convert": "^2.0.1", + "color-string": "^1.9.0" }, - "funding": { - "url": "https://opencollective.com/vitest" + "engines": { + "node": ">=12.5.0" } }, - "node_modules/@vitest/snapshot/node_modules/tinyrainbow": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/tinyrainbow/-/tinyrainbow-3.1.0.tgz", - "integrity": "sha512-Bf+ILmBgretUrdJxzXM0SgXLZ3XfiaUuOj/IKQHuTXip+05Xn+uyEYdVg0kYDipTBcLrCVyUzAPz7QmArb0mmw==", + "node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", "dev": true, "license": "MIT", + "dependencies": { + "color-name": "~1.1.4" + }, "engines": { - "node": ">=14.0.0" + "node": ">=7.0.0" } }, - "node_modules/@vitest/spy": { - "version": "3.2.4", - "resolved": "https://registry.npmjs.org/@vitest/spy/-/spy-3.2.4.tgz", - "integrity": "sha512-vAfasCOe6AIK70iP5UD11Ac4siNUNJ9i/9PZ3NKx07sG6sUxeag1LWdNrMWeKKYBLlzuK+Gn65Yd5nyL6ds+nw==", + "node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true, + "license": "MIT" + }, + "node_modules/color-string": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz", + "integrity": "sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==", "dev": true, "license": "MIT", "dependencies": { - "tinyspy": "^4.0.3" - }, - "funding": { - "url": "https://opencollective.com/vitest" + "color-name": "^1.0.0", + "simple-swizzle": "^0.2.2" } }, - "node_modules/@vitest/utils": { - "version": "3.2.4", - "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-3.2.4.tgz", - "integrity": "sha512-fB2V0JFrQSMsCo9HiSq3Ezpdv4iYaXRG1Sx8edX3MwxfyNn83mKiGzOcH+Fkxt4MHxr3y42fQi1oeAInqgX2QA==", + "node_modules/colorette": { + "version": "2.0.20", + "resolved": "https://registry.npmjs.org/colorette/-/colorette-2.0.20.tgz", + "integrity": "sha512-IfEDxwoWIjkeXL1eXcDiow4UbKjhLdq6/EuSVR9GMN7KVH3r9gQ83e73hsz1Nd1T3ijd5xv1wcWRYO+D6kCI2w==", + "dev": true, + "license": "MIT" + }, + "node_modules/comma-separated-tokens": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/comma-separated-tokens/-/comma-separated-tokens-2.0.3.tgz", + "integrity": "sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg==", "dev": true, "license": "MIT", - "dependencies": { - "@vitest/pretty-format": "3.2.4", - "loupe": "^3.1.4", - "tinyrainbow": "^2.0.0" - }, "funding": { - "url": "https://opencollective.com/vitest" + "type": "github", + "url": "https://github.com/sponsors/wooorm" } }, - "node_modules/accepts": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/accepts/-/accepts-2.0.0.tgz", - "integrity": "sha512-5cvg6CtKwfgdmVqY1WIiXKc3Q1bkRqGLi+2W/6ao+6Y7gu/RCwRuAhGEzh5B4KlszSuTLgZYuqFqo5bImjNKng==", + "node_modules/commander": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-7.2.0.tgz", + "integrity": "sha512-QrWXB+ZQSVPmIWIhtEO9H+gwHaMGYiF5ChvoJ+K9ZGHG/sVsa6yiesAD1GC/x46sET00Xlwo1u49RVVVzvcSkw==", "dev": true, "license": "MIT", - "dependencies": { - "mime-types": "^3.0.0", - "negotiator": "^1.0.0" - }, "engines": { - "node": ">= 0.6" + "node": ">= 10" } }, - "node_modules/acorn": { - "version": "8.15.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz", - "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", + "node_modules/common-tags": { + "version": "1.8.2", + "resolved": "https://registry.npmjs.org/common-tags/-/common-tags-1.8.2.tgz", + "integrity": "sha512-gk/Z852D2Wtb//0I+kRFNKKE9dIIVirjoqPoA1wJU+XePVXZfGeBpk45+A1rKO4Q43prqWBNY/MiIeRLbPWUaA==", "dev": true, "license": "MIT", - "bin": { - "acorn": "bin/acorn" - }, "engines": { - "node": ">=0.4.0" + "node": ">=4.0.0" } }, - "node_modules/acorn-jsx": { - "version": "5.3.2", - "resolved": "https://registry.npmjs.org/acorn-jsx/-/acorn-jsx-5.3.2.tgz", - "integrity": "sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==", + "node_modules/concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", + "dev": true, + "license": "MIT" + }, + "node_modules/consola": { + "version": "3.4.2", + "resolved": "https://registry.npmjs.org/consola/-/consola-3.4.2.tgz", + "integrity": "sha512-5IKcdX0nnYavi6G7TtOhwkYzyjfJlatbjMjuLSfE2kYT5pMDOilZ4OvMhi637CcDICTmz3wARPoyhqyX1Y+XvA==", "dev": true, "license": "MIT", - "peerDependencies": { - "acorn": "^6.0.0 || ^7.0.0 || ^8.0.0" + "engines": { + "node": "^14.18.0 || >=16.10.0" } }, - "node_modules/ajv": { - "version": "6.14.0", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.14.0.tgz", - "integrity": "sha512-IWrosm/yrn43eiKqkfkHis7QioDleaXQHdDVPKg0FSwwd/DuvyX79TZnFOnYpB7dcsFAMmtFztZuXPDvSePkFw==", + "node_modules/content-disposition": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-1.1.0.tgz", + "integrity": "sha512-5jRCH9Z/+DRP7rkvY83B+yGIGX96OYdJmzngqnw2SBSxqCFPd0w2km3s5iawpGX8krnwSGmF0FW5Nhr0Hfai3g==", "dev": true, "license": "MIT", - "dependencies": { - "fast-deep-equal": "^3.1.1", - "fast-json-stable-stringify": "^2.0.0", - "json-schema-traverse": "^0.4.1", - "uri-js": "^4.2.2" + "engines": { + "node": ">=18" }, "funding": { - "type": "github", - "url": "https://github.com/sponsors/epoberezkin" + "type": "opencollective", + "url": "https://opencollective.com/express" } }, - "node_modules/ajv-formats": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/ajv-formats/-/ajv-formats-3.0.1.tgz", - "integrity": "sha512-8iUql50EUR+uUcdRQ3HDqa6EVyo3docL8g5WJ3FNcWmu62IbkGUue/pEyLBW8VGKKucTPgqeks4fIU1DA4yowQ==", + "node_modules/content-type": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz", + "integrity": "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==", "dev": true, "license": "MIT", - "dependencies": { - "ajv": "^8.0.0" - }, - "peerDependencies": { - "ajv": "^8.0.0" - }, - "peerDependenciesMeta": { - "ajv": { - "optional": true - } + "engines": { + "node": ">= 0.6" } }, - "node_modules/ajv-formats/node_modules/ajv": { - "version": "8.18.0", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.18.0.tgz", - "integrity": "sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A==", + "node_modules/convert-source-map": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", + "integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==", + "dev": true, + "license": "MIT" + }, + "node_modules/cookie": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/cookie/-/cookie-1.1.1.tgz", + "integrity": "sha512-ei8Aos7ja0weRpFzJnEA9UHJ/7XQmqglbRwnf2ATjcB9Wq874VKH9kfjjirM6UhU2/E5fFYadylyhFldcqSidQ==", "dev": true, "license": "MIT", - "dependencies": { - "fast-deep-equal": "^3.1.3", - "fast-uri": "^3.0.1", - "json-schema-traverse": "^1.0.0", - "require-from-string": "^2.0.2" + "engines": { + "node": ">=18" }, "funding": { - "type": "github", - "url": "https://github.com/sponsors/epoberezkin" + "type": "opencollective", + "url": "https://opencollective.com/express" } }, - "node_modules/ajv-formats/node_modules/json-schema-traverse": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", - "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", - "dev": true, - "license": "MIT" - }, - "node_modules/ansi-regex": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", - "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "node_modules/cookie-signature": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.2.2.tgz", + "integrity": "sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg==", "dev": true, "license": "MIT", - "peer": true, "engines": { - "node": ">=8" + "node": ">=6.6.0" } }, - "node_modules/ansi-styles": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", - "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "node_modules/core-js-compat": { + "version": "3.49.0", + "resolved": "https://registry.npmjs.org/core-js-compat/-/core-js-compat-3.49.0.tgz", + "integrity": "sha512-VQXt1jr9cBz03b331DFDCCP90b3fanciLkgiOoy8SBHy06gNf+vQ1A3WFLqG7I8TipYIKeYK9wxd0tUrvHcOZA==", "dev": true, "license": "MIT", "dependencies": { - "color-convert": "^2.0.1" - }, - "engines": { - "node": ">=8" + "browserslist": "^4.28.1" }, "funding": { - "url": "https://github.com/chalk/ansi-styles?sponsor=1" + "type": "opencollective", + "url": "https://opencollective.com/core-js" } }, - "node_modules/argparse": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", - "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", - "dev": true, - "license": "Python-2.0" - }, - "node_modules/aria-query": { - "version": "5.3.0", - "resolved": "https://registry.npmjs.org/aria-query/-/aria-query-5.3.0.tgz", - "integrity": "sha512-b0P0sZPKtyu8HkeRAfCq0IfURZK+SuwMjY1UXGBU27wpAiTwQAIlq56IbIO+ytk/JjS1fMR14ee5WBBfKi5J6A==", + "node_modules/cors": { + "version": "2.8.6", + "resolved": "https://registry.npmjs.org/cors/-/cors-2.8.6.tgz", + "integrity": "sha512-tJtZBBHA6vjIAaF6EnIaq6laBBP9aq/Y3ouVJjEfoHbRBcHBAHYcMh/w8LDrk2PvIMMq8gmopa5D4V8RmbrxGw==", "dev": true, - "license": "Apache-2.0", + "license": "MIT", "dependencies": { - "dequal": "^2.0.3" + "object-assign": "^4", + "vary": "^1" + }, + "engines": { + "node": ">= 0.10" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, - "node_modules/assertion-error": { + "node_modules/corser": { "version": "2.0.1", - "resolved": "https://registry.npmjs.org/assertion-error/-/assertion-error-2.0.1.tgz", - "integrity": "sha512-Izi8RQcffqCeNVgFigKli1ssklIbpHnCYc6AknXGYoB6grJqyeby7jv12JUQgmTAnIDnbck1uxksT4dzN3PWBA==", + "resolved": "https://registry.npmjs.org/corser/-/corser-2.0.1.tgz", + "integrity": "sha512-utCYNzRSQIZNPIcGZdQc92UVJYAhtGAteCFg0yRaFm8f0P+CPtyGyHXJcGXnffjCybUCEx3FQ2G7U3/o9eIkVQ==", "dev": true, "license": "MIT", "engines": { - "node": ">=12" + "node": ">= 0.4.0" } }, - "node_modules/ast-types": { - "version": "0.16.1", - "resolved": "https://registry.npmjs.org/ast-types/-/ast-types-0.16.1.tgz", - "integrity": "sha512-6t10qk83GOG8p0vKmaCr8eiilZwO171AvbROMtvvNiwrTly62t+7XkA8RdIIVbpMhCASAsxgAzdRSwh6nw/5Dg==", + "node_modules/cose-base": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/cose-base/-/cose-base-1.0.3.tgz", + "integrity": "sha512-s9whTXInMSgAp/NVXVNuVxVKzGH2qck3aQlVHxDCdAEPgtMKwc4Wq6/QKhgdEdgbLSi9rBTAcPoRa6JpiG4ksg==", "dev": true, "license": "MIT", "dependencies": { - "tslib": "^2.0.1" + "layout-base": "^1.0.0" + } + }, + "node_modules/cross-spawn": { + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", + "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==", + "dev": true, + "license": "MIT", + "dependencies": { + "path-key": "^3.1.0", + "shebang-command": "^2.0.0", + "which": "^2.0.1" }, "engines": { - "node": ">=4" + "node": ">= 8" } }, - "node_modules/ast-v8-to-istanbul": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/ast-v8-to-istanbul/-/ast-v8-to-istanbul-1.0.3.tgz", - "integrity": "sha512-jCMQ6ZylLPudp0CDfBmQBZUsrh1/8psbmu9ibeVWKuHWD0YrH9YABwlKu5kVEFoT0GCQQW9Z/SxfuEbbkGQCRg==", + "node_modules/crypto-random-string": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/crypto-random-string/-/crypto-random-string-2.0.0.tgz", + "integrity": "sha512-v1plID3y9r/lPhviJ1wrXpLeyUIGAZ2SHNYTEapm7/8A9nLPoyvVp3RK/EPFqn5kEznyWgYZNsRtYYIWbuG8KA==", "dev": true, "license": "MIT", - "dependencies": { - "@jridgewell/trace-mapping": "^0.3.31", - "estree-walker": "^3.0.3", - "js-tokens": "^10.0.0" + "engines": { + "node": ">=8" } }, - "node_modules/async": { - "version": "3.2.6", - "resolved": "https://registry.npmjs.org/async/-/async-3.2.6.tgz", - "integrity": "sha512-htCUDlxyyCLMgaM3xXg0C0LW2xqfuQ6p05pCEIsXuyQ+a1koYKTuBMzRNwmybfLgvJDMd0r1LTn4+E0Ti6C2AA==", + "node_modules/css.escape": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/css.escape/-/css.escape-1.5.1.tgz", + "integrity": "sha512-YUifsXXuknHlUsmlgyY0PKzgPOr7/FjCePfHNt0jxm83wHZi44VDMQ7/fGNkjY3/jV1MC+1CmZbaHzugyeRtpg==", "dev": true, "license": "MIT" }, - "node_modules/axe-core": { - "version": "4.10.3", - "resolved": "https://registry.npmjs.org/axe-core/-/axe-core-4.10.3.tgz", - "integrity": "sha512-Xm7bpRXnDSX2YE2YFfBk2FnF0ep6tmG7xPh8iHee8MIcrgq762Nkce856dYtJYLkuIoYZvGfTs/PbZhideTcEg==", + "node_modules/cssesc": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz", + "integrity": "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==", "dev": true, - "license": "MPL-2.0", + "license": "MIT", + "bin": { + "cssesc": "bin/cssesc" + }, "engines": { "node": ">=4" } }, - "node_modules/axobject-query": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/axobject-query/-/axobject-query-4.1.0.tgz", - "integrity": "sha512-qIj0G9wZbMGNLjLmg1PT6v2mE9AH2zlnADJD/2tC6E00hgmhUOfEB6greHPAfLRSufHqROIUTkw6E+M3lH0PTQ==", + "node_modules/csstype": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.2.3.tgz", + "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==", "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">= 0.4" - } + "license": "MIT", + "peer": true }, - "node_modules/bail": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/bail/-/bail-2.0.2.tgz", - "integrity": "sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw==", + "node_modules/cytoscape": { + "version": "3.34.0", + "resolved": "https://registry.npmjs.org/cytoscape/-/cytoscape-3.34.0.tgz", + "integrity": "sha512-62rNSrioXw93uliKFBwjukeQyeWwH2PqDrTac31r2P6464u3AUvTk0xS4LVvT251g7IgkFunrI48ZEZGjywSOg==", "dev": true, "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" + "engines": { + "node": ">=0.10" } }, - "node_modules/balanced-match": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", - "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", - "dev": true, - "license": "MIT" - }, - "node_modules/basic-auth": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/basic-auth/-/basic-auth-2.0.1.tgz", - "integrity": "sha512-NF+epuEdnUYVlGuhaxbbq+dvJttwLnGY+YixlXlME5KpQ5W3CnXA5cVTneY3SPbPDRkcjMbifrwmFYcClgOZeg==", + "node_modules/cytoscape-cose-bilkent": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/cytoscape-cose-bilkent/-/cytoscape-cose-bilkent-4.1.0.tgz", + "integrity": "sha512-wgQlVIUJF13Quxiv5e1gstZ08rnZj2XaLHGoFMYXz7SkNfCDOOteKBE6SYRfA9WxxI/iBc3ajfDoc6hb/MRAHQ==", "dev": true, "license": "MIT", "dependencies": { - "safe-buffer": "5.1.2" + "cose-base": "^1.0.0" }, - "engines": { - "node": ">= 0.8" + "peerDependencies": { + "cytoscape": "^3.2.0" } }, - "node_modules/bits-ui": { - "version": "2.18.1", - "resolved": "https://registry.npmjs.org/bits-ui/-/bits-ui-2.18.1.tgz", - "integrity": "sha512-KkemzKFH4T3gt3H+P86JcnAWExjByv/6vlwjm/BoCwTPHu03yiCdxbghdJLvFReQTe0acCAiRcKfmixxD6XvlA==", + "node_modules/cytoscape-fcose": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/cytoscape-fcose/-/cytoscape-fcose-2.2.0.tgz", + "integrity": "sha512-ki1/VuRIHFCzxWNrsshHYPs6L7TvLu3DL+TyIGEsRcvVERmxokbf5Gdk7mFxZnTdiGtnA4cfSmjZJMviqSuZrQ==", "dev": true, "license": "MIT", "dependencies": { - "@floating-ui/core": "^1.7.1", - "@floating-ui/dom": "^1.7.1", - "esm-env": "^1.1.2", - "runed": "^0.35.1", - "svelte-toolbelt": "^0.10.6", - "tabbable": "^6.2.0" - }, - "engines": { - "node": ">=20" - }, - "funding": { - "url": "https://github.com/sponsors/huntabyte" + "cose-base": "^2.2.0" }, "peerDependencies": { - "@internationalized/date": "^3.8.1", - "svelte": "^5.33.0" + "cytoscape": "^3.2.0" } }, - "node_modules/bits-ui/node_modules/runed": { - "version": "0.35.1", - "resolved": "https://registry.npmjs.org/runed/-/runed-0.35.1.tgz", - "integrity": "sha512-2F4Q/FZzbeJTFdIS/PuOoPRSm92sA2LhzTnv6FXhCoENb3huf5+fDuNOg1LNvGOouy3u/225qxmuJvcV3IZK5Q==", + "node_modules/cytoscape-fcose/node_modules/cose-base": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/cose-base/-/cose-base-2.2.0.tgz", + "integrity": "sha512-AzlgcsCbUMymkADOJtQm3wO9S3ltPfYOFD5033keQn9NJzIbtnZj+UdBJe7DYml/8TdbtHJW3j58SOnKhWY/5g==", "dev": true, - "funding": [ - "https://github.com/sponsors/huntabyte", - "https://github.com/sponsors/tglide" - ], "license": "MIT", "dependencies": { - "dequal": "^2.0.3", - "esm-env": "^1.0.0", - "lz-string": "^1.5.0" - }, - "peerDependencies": { - "@sveltejs/kit": "^2.21.0", - "svelte": "^5.7.0" - }, - "peerDependenciesMeta": { - "@sveltejs/kit": { - "optional": true - } + "layout-base": "^2.0.0" } }, - "node_modules/bits-ui/node_modules/svelte-toolbelt": { - "version": "0.10.6", - "resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.10.6.tgz", - "integrity": "sha512-YWuX+RE+CnWYx09yseAe4ZVMM7e7GRFZM6OYWpBKOb++s+SQ8RBIMMe+Bs/CznBMc0QPLjr+vDBxTAkozXsFXQ==", + "node_modules/cytoscape-fcose/node_modules/layout-base": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/layout-base/-/layout-base-2.0.1.tgz", + "integrity": "sha512-dp3s92+uNI1hWIpPGH3jK2kxE2lMjdXdr+DH8ynZHpd6PUlH6x6cbuXnoMmiNumznqaNO31xu9e79F0uuZ0JFg==", "dev": true, - "funding": [ - "https://github.com/sponsors/huntabyte" - ], + "license": "MIT" + }, + "node_modules/d3": { + "version": "7.9.0", + "resolved": "https://registry.npmjs.org/d3/-/d3-7.9.0.tgz", + "integrity": "sha512-e1U46jVP+w7Iut8Jt8ri1YsPOvFpg46k+K8TpCb0P+zjCkjkPnV7WzfDJzMHy1LnA+wj5pLT1wjO901gLXeEhA==", + "dev": true, + "license": "ISC", "dependencies": { - "clsx": "^2.1.1", - "runed": "^0.35.1", - "style-to-object": "^1.0.8" + "d3-array": "3", + "d3-axis": "3", + "d3-brush": "3", + "d3-chord": "3", + "d3-color": "3", + "d3-contour": "4", + "d3-delaunay": "6", + "d3-dispatch": "3", + "d3-drag": "3", + "d3-dsv": "3", + "d3-ease": "3", + "d3-fetch": "3", + "d3-force": "3", + "d3-format": "3", + "d3-geo": "3", + "d3-hierarchy": "3", + "d3-interpolate": "3", + "d3-path": "3", + "d3-polygon": "3", + "d3-quadtree": "3", + "d3-random": "3", + "d3-scale": "4", + "d3-scale-chromatic": "3", + "d3-selection": "3", + "d3-shape": "3", + "d3-time": "3", + "d3-time-format": "4", + "d3-timer": "3", + "d3-transition": "3", + "d3-zoom": "3" }, "engines": { - "node": ">=18", - "pnpm": ">=8.7.0" - }, - "peerDependencies": { - "svelte": "^5.30.2" + "node": ">=12" } }, - "node_modules/body-parser": { - "version": "2.2.2", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.2.tgz", - "integrity": "sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==", + "node_modules/d3-array": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/d3-array/-/d3-array-3.2.4.tgz", + "integrity": "sha512-tdQAmyA18i4J7wprpYq8ClcxZy3SC31QMeByyCFyRt7BVHdREQZ5lpzoe5mFEYZUWe+oq8HBvk9JjpibyEV4Jg==", "dev": true, - "license": "MIT", + "license": "ISC", "dependencies": { - "bytes": "^3.1.2", - "content-type": "^1.0.5", - "debug": "^4.4.3", - "http-errors": "^2.0.0", - "iconv-lite": "^0.7.0", - "on-finished": "^2.4.1", - "qs": "^6.14.1", - "raw-body": "^3.0.1", - "type-is": "^2.0.1" + "internmap": "1 - 2" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-axis": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/d3-axis/-/d3-axis-3.0.0.tgz", + "integrity": "sha512-IH5tgjV4jE/GhHkRV0HiVYPDtvfjHQlQfJHs0usq7M30XcSBvOotpmH1IgkcXsO/5gEQZD43B//fc7SRT5S+xw==", + "dev": true, + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-brush": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/d3-brush/-/d3-brush-3.0.0.tgz", + "integrity": "sha512-ALnjWlVYkXsVIGlOsuWH1+3udkYFI48Ljihfnh8FZPF2QS9o+PzGLBslO0PjzVoHLZ2KCVgAM8NVkXPJB2aNnQ==", + "dev": true, + "license": "ISC", + "dependencies": { + "d3-dispatch": "1 - 3", + "d3-drag": "2 - 3", + "d3-interpolate": "1 - 3", + "d3-selection": "3", + "d3-transition": "3" }, "engines": { - "node": ">=18" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/express" + "node": ">=12" } }, - "node_modules/body-parser/node_modules/iconv-lite": { - "version": "0.7.2", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.2.tgz", - "integrity": "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==", + "node_modules/d3-chord": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-chord/-/d3-chord-3.0.1.tgz", + "integrity": "sha512-VE5S6TNa+j8msksl7HwjxMHDM2yNK3XCkusIlpX5kwauBfXuyLAtNg9jCp/iHH61tgI4sb6R/EIMWCqEIdjT/g==", "dev": true, - "license": "MIT", + "license": "ISC", "dependencies": { - "safer-buffer": ">= 2.1.2 < 3.0.0" + "d3-path": "1 - 3" }, "engines": { - "node": ">=0.10.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/express" + "node": ">=12" } }, - "node_modules/brace-expansion": { - "version": "1.1.13", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.13.tgz", - "integrity": "sha512-9ZLprWS6EENmhEOpjCYW2c8VkmOvckIJZfkr7rBW6dObmfgJ/L1GpSYW5Hpo9lDz4D1+n0Ckz8rU7FwHDQiG/w==", + "node_modules/d3-color": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/d3-color/-/d3-color-3.1.0.tgz", + "integrity": "sha512-zg/chbXyeBtMQ1LbD/WSoW2DpC3I0mpmPdW+ynRTj/x2DAWYrIY7qeZIHidozwV24m4iavr15lNwIwLxRmOxhA==", "dev": true, - "license": "MIT", - "dependencies": { - "balanced-match": "^1.0.0", - "concat-map": "0.0.1" + "license": "ISC", + "engines": { + "node": ">=12" } }, - "node_modules/braces": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz", - "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==", + "node_modules/d3-contour": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/d3-contour/-/d3-contour-4.0.2.tgz", + "integrity": "sha512-4EzFTRIikzs47RGmdxbeUvLWtGedDUNkTcmzoeyg4sP/dvCexO47AaQL7VKy/gul85TOxw+IBgA8US2xwbToNA==", "dev": true, - "license": "MIT", - "optional": true, + "license": "ISC", "dependencies": { - "fill-range": "^7.1.1" + "d3-array": "^3.2.0" }, "engines": { - "node": ">=8" + "node": ">=12" } }, - "node_modules/bundle-name": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/bundle-name/-/bundle-name-4.1.0.tgz", - "integrity": "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==", + "node_modules/d3-delaunay": { + "version": "6.0.4", + "resolved": "https://registry.npmjs.org/d3-delaunay/-/d3-delaunay-6.0.4.tgz", + "integrity": "sha512-mdjtIZ1XLAM8bm/hx3WwjfHt6Sggek7qH043O8KEjDXN40xi3vx/6pYSVTwLjEgiXQTbvaouWKynLBiUZ6SK6A==", "dev": true, - "license": "MIT", + "license": "ISC", "dependencies": { - "run-applescript": "^7.0.0" + "delaunator": "5" }, "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" + "node": ">=12" } }, - "node_modules/bytes": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/bytes/-/bytes-3.1.2.tgz", - "integrity": "sha512-/Nf7TyzTx6S3yRJObOAV7956r8cr2+Oj8AC5dt8wSP3BQAoeX58NoHyCU8P8zGkNXStjTSi6fzO6F0pBdcYbEg==", + "node_modules/d3-dispatch": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-dispatch/-/d3-dispatch-3.0.1.tgz", + "integrity": "sha512-rzUyPU/S7rwUflMyLc1ETDeBj0NRuHKKAcvukozwhshr6g6c5d8zh4c2gQjY2bZ0dXeGLWc1PF174P2tVvKhfg==", "dev": true, - "license": "MIT", + "license": "ISC", "engines": { - "node": ">= 0.8" + "node": ">=12" } }, - "node_modules/call-bind-apply-helpers": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", - "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", + "node_modules/d3-drag": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/d3-drag/-/d3-drag-3.0.0.tgz", + "integrity": "sha512-pWbUJLdETVA8lQNJecMxoXfH6x+mO2UQo8rSmZ+QqxcbyA3hfeprFgIT//HW2nlHChWeIIMwS2Fq+gEARkhTkg==", "dev": true, - "license": "MIT", + "license": "ISC", "dependencies": { - "es-errors": "^1.3.0", - "function-bind": "^1.1.2" + "d3-dispatch": "1 - 3", + "d3-selection": "3" }, "engines": { - "node": ">= 0.4" + "node": ">=12" } }, - "node_modules/call-bound": { - "version": "1.0.4", - "resolved": "https://registry.npmjs.org/call-bound/-/call-bound-1.0.4.tgz", - "integrity": "sha512-+ys997U96po4Kx/ABpBCqhA9EuxJaQWDQg7295H4hBphv3IZg0boBKuwYpt4YXp6MZ5AmZQnU/tyMTlRpaSejg==", + "node_modules/d3-dsv": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-dsv/-/d3-dsv-3.0.1.tgz", + "integrity": "sha512-UG6OvdI5afDIFP9w4G0mNq50dSOsXHJaRE8arAS5o9ApWnIElp8GZw1Dun8vP8OyHOZ/QJUKUJwxiiCCnUwm+Q==", "dev": true, - "license": "MIT", + "license": "ISC", "dependencies": { - "call-bind-apply-helpers": "^1.0.2", - "get-intrinsic": "^1.3.0" + "commander": "7", + "iconv-lite": "0.6", + "rw": "1" }, - "engines": { - "node": ">= 0.4" + "bin": { + "csv2json": "bin/dsv2json.js", + "csv2tsv": "bin/dsv2dsv.js", + "dsv2dsv": "bin/dsv2dsv.js", + "dsv2json": "bin/dsv2json.js", + "json2csv": "bin/json2dsv.js", + "json2dsv": "bin/json2dsv.js", + "json2tsv": "bin/json2dsv.js", + "tsv2csv": "bin/dsv2dsv.js", + "tsv2json": "bin/dsv2json.js" }, - "funding": { - "url": "https://github.com/sponsors/ljharb" + "engines": { + "node": ">=12" } }, - "node_modules/callsites": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", - "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==", + "node_modules/d3-dsv/node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", "dev": true, "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, "engines": { - "node": ">=6" + "node": ">=0.10.0" } }, - "node_modules/ccount": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/ccount/-/ccount-2.0.1.tgz", - "integrity": "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg==", + "node_modules/d3-ease": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-ease/-/d3-ease-3.0.1.tgz", + "integrity": "sha512-wR/XK3D3XcLIZwpbvQwQ5fK+8Ykds1ip7A2Txe0yxncXSdq1L9skcG7blcedkOX+ZcgxGAmLX1FrRGbADwzi0w==", "dev": true, - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" + "license": "BSD-3-Clause", + "engines": { + "node": ">=12" } }, - "node_modules/chai": { - "version": "5.2.1", - "resolved": "https://registry.npmjs.org/chai/-/chai-5.2.1.tgz", - "integrity": "sha512-5nFxhUrX0PqtyogoYOA8IPswy5sZFTOsBFl/9bNsmDLgsxYTzSZQJDPppDnZPTQbzSEm0hqGjWPzRemQCYbD6A==", + "node_modules/d3-fetch": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-fetch/-/d3-fetch-3.0.1.tgz", + "integrity": "sha512-kpkQIM20n3oLVBKGg6oHrUchHM3xODkTzjMoj7aWQFq5QEM+R6E4WkzT5+tojDY7yjez8KgCBRoj4aEr99Fdqw==", "dev": true, - "license": "MIT", + "license": "ISC", "dependencies": { - "assertion-error": "^2.0.1", - "check-error": "^2.1.1", - "deep-eql": "^5.0.1", - "loupe": "^3.1.0", - "pathval": "^2.0.0" + "d3-dsv": "1 - 3" }, "engines": { - "node": ">=18" + "node": ">=12" } }, - "node_modules/chalk": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", - "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "node_modules/d3-force": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/d3-force/-/d3-force-3.0.0.tgz", + "integrity": "sha512-zxV/SsA+U4yte8051P4ECydjD/S+qeYtnaIyAs9tgHCqfguma/aAQDjo85A9Z6EKhBirHRJHXIgJUlffT4wdLg==", "dev": true, - "license": "MIT", + "license": "ISC", "dependencies": { - "ansi-styles": "^4.1.0", - "supports-color": "^7.1.0" + "d3-dispatch": "1 - 3", + "d3-quadtree": "1 - 3", + "d3-timer": "1 - 3" }, "engines": { - "node": ">=10" - }, - "funding": { - "url": "https://github.com/chalk/chalk?sponsor=1" + "node": ">=12" } }, - "node_modules/character-entities": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/character-entities/-/character-entities-2.0.2.tgz", - "integrity": "sha512-shx7oQ0Awen/BRIdkjkvz54PnEEI/EjwXDSIZp86/KKdbafHh1Df/RYGBhn4hbe2+uKC9FnT5UCEdyPz3ai9hQ==", + "node_modules/d3-format": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/d3-format/-/d3-format-3.1.2.tgz", + "integrity": "sha512-AJDdYOdnyRDV5b6ArilzCPPwc1ejkHcoyFarqlPqT7zRYjhavcT3uSrqcMvsgh2CgoPbK3RCwyHaVyxYcP2Arg==", "dev": true, - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" + "license": "ISC", + "engines": { + "node": ">=12" } }, - "node_modules/character-entities-html4": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/character-entities-html4/-/character-entities-html4-2.1.0.tgz", - "integrity": "sha512-1v7fgQRj6hnSwFpq1Eu0ynr/CDEw0rXo2B61qXrLNdHZmPKgb7fqS1a2JwF0rISo9q77jDI8VMEHoApn8qDoZA==", + "node_modules/d3-geo": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/d3-geo/-/d3-geo-3.1.1.tgz", + "integrity": "sha512-637ln3gXKXOwhalDzinUgY83KzNWZRKbYubaG+fGVuc/dxO64RRljtCTnf5ecMyE1RIdtqpkVcq0IbtU2S8j2Q==", "dev": true, - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" + "license": "ISC", + "dependencies": { + "d3-array": "2.5.0 - 3" + }, + "engines": { + "node": ">=12" } }, - "node_modules/character-entities-legacy": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/character-entities-legacy/-/character-entities-legacy-3.0.0.tgz", - "integrity": "sha512-RpPp0asT/6ufRm//AJVwpViZbGM/MkjQFxJccQRHmISF/22NBtsHqAWmL+/pmkPWoIUJdWyeVleTl1wydHATVQ==", + "node_modules/d3-hierarchy": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/d3-hierarchy/-/d3-hierarchy-3.1.2.tgz", + "integrity": "sha512-FX/9frcub54beBdugHjDCdikxThEqjnR93Qt7PvQTOHxyiNCAlvMrHhclk3cD5VeAaq9fxmfRp+CnWw9rEMBuA==", "dev": true, - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" + "license": "ISC", + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-interpolate": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-interpolate/-/d3-interpolate-3.0.1.tgz", + "integrity": "sha512-3bYs1rOD33uo8aqJfKP3JWPAibgw8Zm2+L9vBKEHJ2Rg+viTR7o5Mmv5mZcieN+FRYaAOWX5SJATX6k1PWz72g==", + "dev": true, + "license": "ISC", + "dependencies": { + "d3-color": "1 - 3" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/d3-path": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/d3-path/-/d3-path-3.1.0.tgz", + "integrity": "sha512-p3KP5HCf/bvjBSSKuXid6Zqijx7wIfNW+J/maPs+iwR35at5JCbLUT0LzF1cnjbCHWhqzQTIN2Jpe8pRebIEFQ==", + "dev": true, + "license": "ISC", + "engines": { + "node": ">=12" } }, - "node_modules/check-error": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/check-error/-/check-error-2.1.1.tgz", - "integrity": "sha512-OAlb+T7V4Op9OwdkjmguYRqncdlx5JiofwOAUkmTF+jNdHwzTaTs4sRAGpzLF3oOz5xAyDGrPgeIDFQmDOTiJw==", + "node_modules/d3-polygon": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-polygon/-/d3-polygon-3.0.1.tgz", + "integrity": "sha512-3vbA7vXYwfe1SYhED++fPUQlWSYTTGmFmQiany/gdbiWgU/iEyQzyymwL9SkJjFFuCS4902BSzewVGsHHmHtXg==", "dev": true, - "license": "MIT", + "license": "ISC", "engines": { - "node": ">= 16" + "node": ">=12" } }, - "node_modules/chokidar": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-4.0.3.tgz", - "integrity": "sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==", + "node_modules/d3-quadtree": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-quadtree/-/d3-quadtree-3.0.1.tgz", + "integrity": "sha512-04xDrxQTDTCFwP5H6hRhsRcb9xxv2RzkcsygFzmkSIOJy3PeRJP7sNk3VRIbKXcog561P9oU0/rVH6vDROAgUw==", "dev": true, - "license": "MIT", - "dependencies": { - "readdirp": "^4.0.1" - }, + "license": "ISC", "engines": { - "node": ">= 14.16.0" - }, - "funding": { - "url": "https://paulmillr.com/funding/" + "node": ">=12" } }, - "node_modules/chownr": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/chownr/-/chownr-3.0.0.tgz", - "integrity": "sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==", + "node_modules/d3-random": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-random/-/d3-random-3.0.1.tgz", + "integrity": "sha512-FXMe9GfxTxqd5D6jFsQ+DJ8BJS4E/fT5mqqdjovykEB2oFbTMDVdg1MGFxfQW+FBOGoB++k8swBrgwSHT1cUXQ==", "dev": true, - "license": "BlueOak-1.0.0", + "license": "ISC", "engines": { - "node": ">=18" + "node": ">=12" } }, - "node_modules/chromatic": { - "version": "13.3.5", - "resolved": "https://registry.npmjs.org/chromatic/-/chromatic-13.3.5.tgz", - "integrity": "sha512-MzPhxpl838qJUo0A55osCF2ifwPbjcIPeElr1d4SHcjnHoIcg7l1syJDrAYK/a+PcCBrOGi06jPNpQAln5hWgw==", + "node_modules/d3-sankey": { + "version": "0.12.3", + "resolved": "https://registry.npmjs.org/d3-sankey/-/d3-sankey-0.12.3.tgz", + "integrity": "sha512-nQhsBRmM19Ax5xEIPLMY9ZmJ/cDvd1BG3UVvt5h3WRxKg5zGRbvnteTyWAbzeSvlh3tW7ZEmq4VwR5mB3tutmQ==", "dev": true, - "license": "MIT", - "bin": { - "chroma": "dist/bin.js", - "chromatic": "dist/bin.js", - "chromatic-cli": "dist/bin.js" - }, - "peerDependencies": { - "@chromatic-com/cypress": "^0.*.* || ^1.0.0", - "@chromatic-com/playwright": "^0.*.* || ^1.0.0" - }, - "peerDependenciesMeta": { - "@chromatic-com/cypress": { - "optional": true - }, - "@chromatic-com/playwright": { - "optional": true - } + "license": "BSD-3-Clause", + "dependencies": { + "d3-array": "1 - 2", + "d3-shape": "^1.2.0" } }, - "node_modules/clsx": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/clsx/-/clsx-2.1.1.tgz", - "integrity": "sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==", + "node_modules/d3-sankey/node_modules/d3-array": { + "version": "2.12.1", + "resolved": "https://registry.npmjs.org/d3-array/-/d3-array-2.12.1.tgz", + "integrity": "sha512-B0ErZK/66mHtEsR1TkPEEkwdy+WDesimkM5gpZr5Dsg54BiTA5RXtYW5qTLIAcekaS9xfZrzBLF/OAkB3Qn1YQ==", "dev": true, - "license": "MIT", - "engines": { - "node": ">=6" + "license": "BSD-3-Clause", + "dependencies": { + "internmap": "^1.0.0" } }, - "node_modules/color-convert": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", - "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "node_modules/d3-sankey/node_modules/d3-path": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/d3-path/-/d3-path-1.0.9.tgz", + "integrity": "sha512-VLaYcn81dtHVTjEHd8B+pbe9yHWpXKZUC87PzoFmsFrJqgFwDe/qxfp5MlfsfM1V5E/iVt0MmEbWQ7FVIXh/bg==", "dev": true, - "license": "MIT", + "license": "BSD-3-Clause" + }, + "node_modules/d3-sankey/node_modules/d3-shape": { + "version": "1.3.7", + "resolved": "https://registry.npmjs.org/d3-shape/-/d3-shape-1.3.7.tgz", + "integrity": "sha512-EUkvKjqPFUAZyOlhY5gzCxCeI0Aep04LwIRpsZ/mLFelJiUfnK56jo5JMDSE7yyP2kLSb6LtF+S5chMk7uqPqw==", + "dev": true, + "license": "BSD-3-Clause", "dependencies": { - "color-name": "~1.1.4" - }, - "engines": { - "node": ">=7.0.0" + "d3-path": "1" } }, - "node_modules/color-name": { - "version": "1.1.4", - "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", - "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "node_modules/d3-sankey/node_modules/internmap": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/internmap/-/internmap-1.0.1.tgz", + "integrity": "sha512-lDB5YccMydFBtasVtxnZ3MRBHuaoE8GKsppq+EchKL2U4nK/DmEpPHNH8MZe5HkMtpSiTSOZwfN0tzYjO/lJEw==", "dev": true, - "license": "MIT" + "license": "ISC" }, - "node_modules/comma-separated-tokens": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/comma-separated-tokens/-/comma-separated-tokens-2.0.3.tgz", - "integrity": "sha512-Fu4hJdvzeylCfQPp9SGWidpzrMs7tTrlu6Vb8XGaRGck8QSNZJJp538Wrb60Lax4fPwR64ViY468OIUTbRlGZg==", + "node_modules/d3-scale": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/d3-scale/-/d3-scale-4.0.2.tgz", + "integrity": "sha512-GZW464g1SH7ag3Y7hXjf8RoUuAFIqklOAq3MRl4OaWabTFJY9PN/E1YklhXLh+OQ3fM9yS2nOkCoS+WLZ6kvxQ==", "dev": true, - "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" + "license": "ISC", + "dependencies": { + "d3-array": "2.10.0 - 3", + "d3-format": "1 - 3", + "d3-interpolate": "1.2.0 - 3", + "d3-time": "2.1.1 - 3", + "d3-time-format": "2 - 4" + }, + "engines": { + "node": ">=12" } }, - "node_modules/commander": { - "version": "8.3.0", - "resolved": "https://registry.npmjs.org/commander/-/commander-8.3.0.tgz", - "integrity": "sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww==", + "node_modules/d3-scale-chromatic": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/d3-scale-chromatic/-/d3-scale-chromatic-3.1.0.tgz", + "integrity": "sha512-A3s5PWiZ9YCXFye1o246KoscMWqf8BsD9eRiJ3He7C9OBaxKhAd5TFCdEx/7VbKtxxTsu//1mMJFrEt572cEyQ==", "dev": true, - "license": "MIT", + "license": "ISC", + "dependencies": { + "d3-color": "1 - 3", + "d3-interpolate": "1 - 3" + }, "engines": { - "node": ">= 12" + "node": ">=12" } }, - "node_modules/concat-map": { - "version": "0.0.1", - "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", - "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", + "node_modules/d3-selection": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/d3-selection/-/d3-selection-3.0.0.tgz", + "integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==", "dev": true, - "license": "MIT" + "license": "ISC", + "engines": { + "node": ">=12" + } }, - "node_modules/content-disposition": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/content-disposition/-/content-disposition-1.0.1.tgz", - "integrity": "sha512-oIXISMynqSqm241k6kcQ5UwttDILMK4BiurCfGEREw6+X9jkkpEe5T9FZaApyLGGOnFuyMWZpdolTXMtvEJ08Q==", + "node_modules/d3-shape": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/d3-shape/-/d3-shape-3.2.0.tgz", + "integrity": "sha512-SaLBuwGm3MOViRq2ABk3eLoxwZELpH6zhl3FbAoJ7Vm1gofKx6El1Ib5z23NUEhF9AsGl7y+dzLe5Cw2AArGTA==", "dev": true, - "license": "MIT", + "license": "ISC", + "dependencies": { + "d3-path": "^3.1.0" + }, "engines": { - "node": ">=18" + "node": ">=12" + } + }, + "node_modules/d3-time": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/d3-time/-/d3-time-3.1.0.tgz", + "integrity": "sha512-VqKjzBLejbSMT4IgbmVgDjpkYrNWUYJnbCGo874u7MMKIWsILRX+OpX/gTk8MqjpT1A/c6HY2dCA77ZN0lkQ2Q==", + "dev": true, + "license": "ISC", + "dependencies": { + "d3-array": "2 - 3" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/express" + "engines": { + "node": ">=12" } }, - "node_modules/content-type": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/content-type/-/content-type-1.0.5.tgz", - "integrity": "sha512-nTjqfcBFEipKdXCv4YDQWCfmcLZKm81ldF0pAopTvyrFGVbcR6P/VAAd5G7N+0tTr8QqiU0tFadD6FK4NtJwOA==", + "node_modules/d3-time-format": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/d3-time-format/-/d3-time-format-4.1.0.tgz", + "integrity": "sha512-dJxPBlzC7NugB2PDLwo9Q8JiTR3M3e4/XANkreKSUxF8vvXKqm1Yfq4Q5dl8budlunRVlUUaDUgFt7eA8D6NLg==", "dev": true, - "license": "MIT", + "license": "ISC", + "dependencies": { + "d3-time": "1 - 3" + }, "engines": { - "node": ">= 0.6" + "node": ">=12" } }, - "node_modules/convert-source-map": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/convert-source-map/-/convert-source-map-2.0.0.tgz", - "integrity": "sha512-Kvp459HrV2FEJ1CAsi1Ku+MY3kasH19TFykTz2xWmMeq6bk2NU3XXvfJ+Q61m0xktWwt+1HSYf3JZsTms3aRJg==", + "node_modules/d3-timer": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-timer/-/d3-timer-3.0.1.tgz", + "integrity": "sha512-ndfJ/JxxMd3nw31uyKoY2naivF+r29V+Lc0svZxe1JvvIRmi8hUsrMvdOwgS1o6uBHmiz91geQ0ylPP0aj1VUA==", "dev": true, - "license": "MIT" + "license": "ISC", + "engines": { + "node": ">=12" + } }, - "node_modules/cookie": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/cookie/-/cookie-1.1.1.tgz", - "integrity": "sha512-ei8Aos7ja0weRpFzJnEA9UHJ/7XQmqglbRwnf2ATjcB9Wq874VKH9kfjjirM6UhU2/E5fFYadylyhFldcqSidQ==", + "node_modules/d3-transition": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/d3-transition/-/d3-transition-3.0.1.tgz", + "integrity": "sha512-ApKvfjsSR6tg06xrL434C0WydLr7JewBB3V+/39RMHsaXTOG0zmt/OAXeng5M5LBm0ojmxJrpomQVZ1aPvBL4w==", "dev": true, - "license": "MIT", + "license": "ISC", + "dependencies": { + "d3-color": "1 - 3", + "d3-dispatch": "1 - 3", + "d3-ease": "1 - 3", + "d3-interpolate": "1 - 3", + "d3-timer": "1 - 3" + }, "engines": { - "node": ">=18" + "node": ">=12" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/express" + "peerDependencies": { + "d3-selection": "2 - 3" } }, - "node_modules/cookie-signature": { - "version": "1.2.2", - "resolved": "https://registry.npmjs.org/cookie-signature/-/cookie-signature-1.2.2.tgz", - "integrity": "sha512-D76uU73ulSXrD1UXF4KE2TMxVVwhsnCgfAyTg9k8P6KGZjlXKrOLe4dJQKI3Bxi5wjesZoFXJWElNWBjPZMbhg==", + "node_modules/d3-zoom": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/d3-zoom/-/d3-zoom-3.0.0.tgz", + "integrity": "sha512-b8AmV3kfQaqWAuacbPuNbL6vahnOJflOhexLzMMNLga62+/nh0JzvJ0aO/5a5MVgUFGS7Hu1P9P03o3fJkDCyw==", "dev": true, - "license": "MIT", + "license": "ISC", + "dependencies": { + "d3-dispatch": "1 - 3", + "d3-drag": "2 - 3", + "d3-interpolate": "1 - 3", + "d3-selection": "2 - 3", + "d3-transition": "2 - 3" + }, "engines": { - "node": ">=6.6.0" + "node": ">=12" } }, - "node_modules/cors": { - "version": "2.8.5", - "resolved": "https://registry.npmjs.org/cors/-/cors-2.8.5.tgz", - "integrity": "sha512-KIHbLJqu73RGr/hnbrO9uBeixNGuvSQjul/jdFvS/KFSIH1hWVd1ng7zOHx+YrEfInLG7q4n6GHQ9cDtxv/P6g==", + "node_modules/dagre-d3-es": { + "version": "7.0.14", + "resolved": "https://registry.npmjs.org/dagre-d3-es/-/dagre-d3-es-7.0.14.tgz", + "integrity": "sha512-P4rFMVq9ESWqmOgK+dlXvOtLwYg0i7u0HBGJER0LZDJT2VHIPAMZ/riPxqJceWMStH5+E61QxFra9kIS3AqdMg==", "dev": true, "license": "MIT", "dependencies": { - "object-assign": "^4", - "vary": "^1" - }, - "engines": { - "node": ">= 0.10" + "d3": "^7.9.0", + "lodash-es": "^4.17.21" } }, - "node_modules/corser": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/corser/-/corser-2.0.1.tgz", - "integrity": "sha512-utCYNzRSQIZNPIcGZdQc92UVJYAhtGAteCFg0yRaFm8f0P+CPtyGyHXJcGXnffjCybUCEx3FQ2G7U3/o9eIkVQ==", + "node_modules/data-view-buffer": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/data-view-buffer/-/data-view-buffer-1.0.2.tgz", + "integrity": "sha512-EmKO5V3OLXh1rtK2wgXRansaK1/mtVdTUEiEI0W8RkvgT05kfxaH29PliLnpLP73yYO6142Q72QNa8Wx/A5CqQ==", "dev": true, "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "es-errors": "^1.3.0", + "is-data-view": "^1.0.2" + }, "engines": { - "node": ">= 0.4.0" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/cose-base": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/cose-base/-/cose-base-1.0.3.tgz", - "integrity": "sha512-s9whTXInMSgAp/NVXVNuVxVKzGH2qck3aQlVHxDCdAEPgtMKwc4Wq6/QKhgdEdgbLSi9rBTAcPoRa6JpiG4ksg==", + "node_modules/data-view-byte-length": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/data-view-byte-length/-/data-view-byte-length-1.0.2.tgz", + "integrity": "sha512-tuhGbE6CfTM9+5ANGf+oQb72Ky/0+s3xKUpHvShfiz2RxMFgFPjsXuRLBVMtvMs15awe45SRb83D6wH4ew6wlQ==", "dev": true, "license": "MIT", "dependencies": { - "layout-base": "^1.0.0" + "call-bound": "^1.0.3", + "es-errors": "^1.3.0", + "is-data-view": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/inspect-js" } }, - "node_modules/cross-spawn": { - "version": "7.0.6", - "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", - "integrity": "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==", + "node_modules/data-view-byte-offset": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/data-view-byte-offset/-/data-view-byte-offset-1.0.1.tgz", + "integrity": "sha512-BS8PfmtDGnrgYdOonGZQdLZslWIeCGFP9tpan0hi1Co2Zr2NKADsvGYA8XxuG/4UWgJ6Cjtv+YJnB6MM69QGlQ==", "dev": true, "license": "MIT", "dependencies": { - "path-key": "^3.1.0", - "shebang-command": "^2.0.0", - "which": "^2.0.1" + "call-bound": "^1.0.2", + "es-errors": "^1.3.0", + "is-data-view": "^1.0.1" }, "engines": { - "node": ">= 8" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/css.escape": { - "version": "1.5.1", - "resolved": "https://registry.npmjs.org/css.escape/-/css.escape-1.5.1.tgz", - "integrity": "sha512-YUifsXXuknHlUsmlgyY0PKzgPOr7/FjCePfHNt0jxm83wHZi44VDMQ7/fGNkjY3/jV1MC+1CmZbaHzugyeRtpg==", + "node_modules/dayjs": { + "version": "1.11.21", + "resolved": "https://registry.npmjs.org/dayjs/-/dayjs-1.11.21.tgz", + "integrity": "sha512-98IT+HOahAisibz/yjKbzuOBwYcjJ7BCLPzARyHiyEBmRz4fatF+KPJszEHXsGYjUG234aH/cOjW1wwTbKUZlA==", "dev": true, "license": "MIT" }, - "node_modules/cssesc": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/cssesc/-/cssesc-3.0.0.tgz", - "integrity": "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg==", + "node_modules/debug": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", + "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", "dev": true, "license": "MIT", - "bin": { - "cssesc": "bin/cssesc" + "dependencies": { + "ms": "^2.1.3" }, "engines": { - "node": ">=4" + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } } }, - "node_modules/csstype": { - "version": "3.2.3", - "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.2.3.tgz", - "integrity": "sha512-z1HGKcYy2xA8AGQfwrn0PAy+PB7X/GSj3UVJW9qKyn43xWa+gl5nXmU4qqLMRzWVLFC8KusUX8T/0kCiOYpAIQ==", - "dev": true, - "license": "MIT", - "peer": true - }, - "node_modules/cytoscape": { - "version": "3.34.0", - "resolved": "https://registry.npmjs.org/cytoscape/-/cytoscape-3.34.0.tgz", - "integrity": "sha512-62rNSrioXw93uliKFBwjukeQyeWwH2PqDrTac31r2P6464u3AUvTk0xS4LVvT251g7IgkFunrI48ZEZGjywSOg==", + "node_modules/decode-bmp": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/decode-bmp/-/decode-bmp-0.2.1.tgz", + "integrity": "sha512-NiOaGe+GN0KJqi2STf24hfMkFitDUaIoUU3eKvP/wAbLe8o6FuW5n/x7MHPR0HKvBokp6MQY/j7w8lewEeVCIA==", "dev": true, "license": "MIT", + "dependencies": { + "@canvas/image-data": "^1.0.0", + "to-data-view": "^1.1.0" + }, "engines": { - "node": ">=0.10" + "node": ">=8.6.0" } }, - "node_modules/cytoscape-cose-bilkent": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/cytoscape-cose-bilkent/-/cytoscape-cose-bilkent-4.1.0.tgz", - "integrity": "sha512-wgQlVIUJF13Quxiv5e1gstZ08rnZj2XaLHGoFMYXz7SkNfCDOOteKBE6SYRfA9WxxI/iBc3ajfDoc6hb/MRAHQ==", + "node_modules/decode-ico": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/decode-ico/-/decode-ico-0.4.1.tgz", + "integrity": "sha512-69NZfbKIzux1vBOd31al3XnMnH+2mqDhEgLdpygErm4d60N+UwA5Sq5WFjmEDQzumgB9fElojGwWG0vybVfFmA==", "dev": true, "license": "MIT", "dependencies": { - "cose-base": "^1.0.0" + "@canvas/image-data": "^1.0.0", + "decode-bmp": "^0.2.0", + "to-data-view": "^1.1.0" }, - "peerDependencies": { - "cytoscape": "^3.2.0" + "engines": { + "node": ">=8.6" } }, - "node_modules/cytoscape-fcose": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/cytoscape-fcose/-/cytoscape-fcose-2.2.0.tgz", - "integrity": "sha512-ki1/VuRIHFCzxWNrsshHYPs6L7TvLu3DL+TyIGEsRcvVERmxokbf5Gdk7mFxZnTdiGtnA4cfSmjZJMviqSuZrQ==", + "node_modules/decode-named-character-reference": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/decode-named-character-reference/-/decode-named-character-reference-1.3.0.tgz", + "integrity": "sha512-GtpQYB283KrPp6nRw50q3U9/VfOutZOe103qlN7BPP6Ad27xYnOIWv4lPzo8HCAL+mMZofJ9KEy30fq6MfaK6Q==", "dev": true, "license": "MIT", "dependencies": { - "cose-base": "^2.2.0" + "character-entities": "^2.0.0" }, - "peerDependencies": { - "cytoscape": "^3.2.0" + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" } }, - "node_modules/cytoscape-fcose/node_modules/cose-base": { - "version": "2.2.0", - "resolved": "https://registry.npmjs.org/cose-base/-/cose-base-2.2.0.tgz", - "integrity": "sha512-AzlgcsCbUMymkADOJtQm3wO9S3ltPfYOFD5033keQn9NJzIbtnZj+UdBJe7DYml/8TdbtHJW3j58SOnKhWY/5g==", + "node_modules/dedent": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/dedent/-/dedent-1.7.2.tgz", + "integrity": "sha512-WzMx3mW98SN+zn3hgemf4OzdmyNhhhKz5Ay0pUfQiMQ3e1g+xmTJWp/pKdwKVXhdSkAEGIIzqeuWrL3mV/AXbA==", "dev": true, "license": "MIT", - "dependencies": { - "layout-base": "^2.0.0" + "peerDependencies": { + "babel-plugin-macros": "^3.1.0" + }, + "peerDependenciesMeta": { + "babel-plugin-macros": { + "optional": true + } } }, - "node_modules/cytoscape-fcose/node_modules/layout-base": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/layout-base/-/layout-base-2.0.1.tgz", - "integrity": "sha512-dp3s92+uNI1hWIpPGH3jK2kxE2lMjdXdr+DH8ynZHpd6PUlH6x6cbuXnoMmiNumznqaNO31xu9e79F0uuZ0JFg==", + "node_modules/dedent-js": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dedent-js/-/dedent-js-1.0.1.tgz", + "integrity": "sha512-OUepMozQULMLUmhxS95Vudo0jb0UchLimi3+pQ2plj61Fcy8axbP9hbiD4Sz6DPqn6XG3kfmziVfQ1rSys5AJQ==", "dev": true, "license": "MIT" }, - "node_modules/d3": { - "version": "7.9.0", - "resolved": "https://registry.npmjs.org/d3/-/d3-7.9.0.tgz", - "integrity": "sha512-e1U46jVP+w7Iut8Jt8ri1YsPOvFpg46k+K8TpCb0P+zjCkjkPnV7WzfDJzMHy1LnA+wj5pLT1wjO901gLXeEhA==", + "node_modules/deep-eql": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-5.0.2.tgz", + "integrity": "sha512-h5k/5U50IJJFpzfL6nO9jaaumfjO/f2NjK/oYB2Djzm4p9L+3T9qWpZqZ2hAbLPuuYq9wrU08WQyBTL5GbPk5Q==", "dev": true, - "license": "ISC", - "dependencies": { - "d3-array": "3", - "d3-axis": "3", - "d3-brush": "3", - "d3-chord": "3", - "d3-color": "3", - "d3-contour": "4", - "d3-delaunay": "6", - "d3-dispatch": "3", - "d3-drag": "3", - "d3-dsv": "3", - "d3-ease": "3", - "d3-fetch": "3", - "d3-force": "3", - "d3-format": "3", - "d3-geo": "3", - "d3-hierarchy": "3", - "d3-interpolate": "3", - "d3-path": "3", - "d3-polygon": "3", - "d3-quadtree": "3", - "d3-random": "3", - "d3-scale": "4", - "d3-scale-chromatic": "3", - "d3-selection": "3", - "d3-shape": "3", - "d3-time": "3", - "d3-time-format": "4", - "d3-timer": "3", - "d3-transition": "3", - "d3-zoom": "3" - }, + "license": "MIT", "engines": { - "node": ">=12" + "node": ">=6" } }, - "node_modules/d3-array": { - "version": "3.2.4", - "resolved": "https://registry.npmjs.org/d3-array/-/d3-array-3.2.4.tgz", - "integrity": "sha512-tdQAmyA18i4J7wprpYq8ClcxZy3SC31QMeByyCFyRt7BVHdREQZ5lpzoe5mFEYZUWe+oq8HBvk9JjpibyEV4Jg==", + "node_modules/deep-is": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.4.tgz", + "integrity": "sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==", "dev": true, - "license": "ISC", - "dependencies": { - "internmap": "1 - 2" - }, - "engines": { - "node": ">=12" - } + "license": "MIT" }, - "node_modules/d3-axis": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/d3-axis/-/d3-axis-3.0.0.tgz", - "integrity": "sha512-IH5tgjV4jE/GhHkRV0HiVYPDtvfjHQlQfJHs0usq7M30XcSBvOotpmH1IgkcXsO/5gEQZD43B//fc7SRT5S+xw==", + "node_modules/deepmerge": { + "version": "4.3.1", + "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz", + "integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==", "dev": true, - "license": "ISC", + "license": "MIT", "engines": { - "node": ">=12" + "node": ">=0.10.0" } }, - "node_modules/d3-brush": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/d3-brush/-/d3-brush-3.0.0.tgz", - "integrity": "sha512-ALnjWlVYkXsVIGlOsuWH1+3udkYFI48Ljihfnh8FZPF2QS9o+PzGLBslO0PjzVoHLZ2KCVgAM8NVkXPJB2aNnQ==", + "node_modules/default-browser": { + "version": "5.5.0", + "resolved": "https://registry.npmjs.org/default-browser/-/default-browser-5.5.0.tgz", + "integrity": "sha512-H9LMLr5zwIbSxrmvikGuI/5KGhZ8E2zH3stkMgM5LpOWDutGM2JZaj460Udnf1a+946zc7YBgrqEWwbk7zHvGw==", "dev": true, - "license": "ISC", + "license": "MIT", "dependencies": { - "d3-dispatch": "1 - 3", - "d3-drag": "2 - 3", - "d3-interpolate": "1 - 3", - "d3-selection": "3", - "d3-transition": "3" + "bundle-name": "^4.1.0", + "default-browser-id": "^5.0.0" + }, + "engines": { + "node": ">=18" }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/default-browser-id": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/default-browser-id/-/default-browser-id-5.0.1.tgz", + "integrity": "sha512-x1VCxdX4t+8wVfd1so/9w+vQ4vx7lKd2Qp5tDRutErwmR85OgmfX7RlLRMWafRMY7hbEiXIbudNrjOAPa/hL8Q==", + "dev": true, + "license": "MIT", "engines": { - "node": ">=12" + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/d3-chord": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/d3-chord/-/d3-chord-3.0.1.tgz", - "integrity": "sha512-VE5S6TNa+j8msksl7HwjxMHDM2yNK3XCkusIlpX5kwauBfXuyLAtNg9jCp/iHH61tgI4sb6R/EIMWCqEIdjT/g==", + "node_modules/define-data-property": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", + "integrity": "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==", "dev": true, - "license": "ISC", + "license": "MIT", "dependencies": { - "d3-path": "1 - 3" + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", + "gopd": "^1.0.1" }, "engines": { - "node": ">=12" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/d3-color": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/d3-color/-/d3-color-3.1.0.tgz", - "integrity": "sha512-zg/chbXyeBtMQ1LbD/WSoW2DpC3I0mpmPdW+ynRTj/x2DAWYrIY7qeZIHidozwV24m4iavr15lNwIwLxRmOxhA==", + "node_modules/define-lazy-prop": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz", + "integrity": "sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==", "dev": true, - "license": "ISC", + "license": "MIT", "engines": { "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/d3-contour": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/d3-contour/-/d3-contour-4.0.2.tgz", - "integrity": "sha512-4EzFTRIikzs47RGmdxbeUvLWtGedDUNkTcmzoeyg4sP/dvCexO47AaQL7VKy/gul85TOxw+IBgA8US2xwbToNA==", + "node_modules/define-properties": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/define-properties/-/define-properties-1.2.1.tgz", + "integrity": "sha512-8QmQKqEASLd5nx0U1B1okLElbUuuttJ/AnYmRXbbbGDWh6uS208EjD4Xqq/I9wK7u0v6O08XhTWnt5XtEbR6Dg==", "dev": true, - "license": "ISC", + "license": "MIT", "dependencies": { - "d3-array": "^3.2.0" + "define-data-property": "^1.0.1", + "has-property-descriptors": "^1.0.0", + "object-keys": "^1.1.1" }, "engines": { - "node": ">=12" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/d3-delaunay": { - "version": "6.0.4", - "resolved": "https://registry.npmjs.org/d3-delaunay/-/d3-delaunay-6.0.4.tgz", - "integrity": "sha512-mdjtIZ1XLAM8bm/hx3WwjfHt6Sggek7qH043O8KEjDXN40xi3vx/6pYSVTwLjEgiXQTbvaouWKynLBiUZ6SK6A==", + "node_modules/defu": { + "version": "6.1.7", + "resolved": "https://registry.npmjs.org/defu/-/defu-6.1.7.tgz", + "integrity": "sha512-7z22QmUWiQ/2d0KkdYmANbRUVABpZ9SNYyH5vx6PZ+nE5bcC0l7uFvEfHlyld/HcGBFTL536ClDt3DEcSlEJAQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/delaunator": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/delaunator/-/delaunator-5.1.0.tgz", + "integrity": "sha512-AGrQ4QSgssa1NGmWmLPqN5NY2KajF5MqxetNEO+o0n3ZwZZeTmt7bBnvzHWrmkZFxGgr4HdyFgelzgi06otLuQ==", "dev": true, "license": "ISC", "dependencies": { - "delaunator": "5" - }, + "robust-predicates": "^3.0.2" + } + }, + "node_modules/depd": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz", + "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==", + "dev": true, + "license": "MIT", "engines": { - "node": ">=12" + "node": ">= 0.8" } }, - "node_modules/d3-dispatch": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/d3-dispatch/-/d3-dispatch-3.0.1.tgz", - "integrity": "sha512-rzUyPU/S7rwUflMyLc1ETDeBj0NRuHKKAcvukozwhshr6g6c5d8zh4c2gQjY2bZ0dXeGLWc1PF174P2tVvKhfg==", + "node_modules/dequal": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz", + "integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==", "dev": true, - "license": "ISC", + "license": "MIT", "engines": { - "node": ">=12" + "node": ">=6" } }, - "node_modules/d3-drag": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/d3-drag/-/d3-drag-3.0.0.tgz", - "integrity": "sha512-pWbUJLdETVA8lQNJecMxoXfH6x+mO2UQo8rSmZ+QqxcbyA3hfeprFgIT//HW2nlHChWeIIMwS2Fq+gEARkhTkg==", + "node_modules/detect-libc": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.1.2.tgz", + "integrity": "sha512-Btj2BOOO83o3WyH59e8MgXsxEQVcarkUOpEYrubB0urwnN10yQ364rsiByU11nZlqWYZm05i/of7io4mzihBtQ==", "dev": true, - "license": "ISC", - "dependencies": { - "d3-dispatch": "1 - 3", - "d3-selection": "3" - }, + "license": "Apache-2.0", "engines": { - "node": ">=12" + "node": ">=8" } }, - "node_modules/d3-dsv": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/d3-dsv/-/d3-dsv-3.0.1.tgz", - "integrity": "sha512-UG6OvdI5afDIFP9w4G0mNq50dSOsXHJaRE8arAS5o9ApWnIElp8GZw1Dun8vP8OyHOZ/QJUKUJwxiiCCnUwm+Q==", + "node_modules/devalue": { + "version": "5.8.1", + "resolved": "https://registry.npmjs.org/devalue/-/devalue-5.8.1.tgz", + "integrity": "sha512-4CXDYRBGqN+57wVJkuXBYmpAVUSg3L6JAQa/DFqm238G73E1wuyc/JhGQJzN7vUf/CMphYau2zXbfWzDR5aTEw==", "dev": true, - "license": "ISC", + "license": "MIT" + }, + "node_modules/devlop": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/devlop/-/devlop-1.1.0.tgz", + "integrity": "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA==", + "dev": true, + "license": "MIT", "dependencies": { - "commander": "7", - "iconv-lite": "0.6", - "rw": "1" - }, - "bin": { - "csv2json": "bin/dsv2json.js", - "csv2tsv": "bin/dsv2dsv.js", - "dsv2dsv": "bin/dsv2dsv.js", - "dsv2json": "bin/dsv2json.js", - "json2csv": "bin/json2dsv.js", - "json2dsv": "bin/json2dsv.js", - "json2tsv": "bin/json2dsv.js", - "tsv2csv": "bin/dsv2dsv.js", - "tsv2json": "bin/dsv2json.js" + "dequal": "^2.0.0" }, - "engines": { - "node": ">=12" + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" } }, - "node_modules/d3-dsv/node_modules/commander": { - "version": "7.2.0", - "resolved": "https://registry.npmjs.org/commander/-/commander-7.2.0.tgz", - "integrity": "sha512-QrWXB+ZQSVPmIWIhtEO9H+gwHaMGYiF5ChvoJ+K9ZGHG/sVsa6yiesAD1GC/x46sET00Xlwo1u49RVVVzvcSkw==", + "node_modules/dexie": { + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/dexie/-/dexie-4.4.3.tgz", + "integrity": "sha512-N+3IGQ3HPlyO2YAkntGAwitm42BpBGV86MttzUMiRzWLa4NGh0pltVRcUVF4ybL/OnXjCrr9k7SDPIKkFYP2Lg==", + "dev": true, + "license": "Apache-2.0" + }, + "node_modules/dom-accessibility-api": { + "version": "0.5.16", + "resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.5.16.tgz", + "integrity": "sha512-X7BJ2yElsnOJ30pZF4uIIDfBEVgF4XEBxL9Bxhy6dnrm5hkzqmsWHGTiHqRiITNhMyFLyAiWndIJP7Z1NTteDg==", "dev": true, "license": "MIT", - "engines": { - "node": ">= 10" - } + "peer": true }, - "node_modules/d3-ease": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/d3-ease/-/d3-ease-3.0.1.tgz", - "integrity": "sha512-wR/XK3D3XcLIZwpbvQwQ5fK+8Ykds1ip7A2Txe0yxncXSdq1L9skcG7blcedkOX+ZcgxGAmLX1FrRGbADwzi0w==", + "node_modules/dompurify": { + "version": "3.4.11", + "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.4.11.tgz", + "integrity": "sha512-zhlUV12GsaRzMsf9q5M254YhA4+VuF0fG+QFqu6aYpoGlKtz+w8//jBcGVYBgQkR5GHjUomejY84AV+/uPbWdw==", "dev": true, - "license": "BSD-3-Clause", - "engines": { - "node": ">=12" + "license": "(MPL-2.0 OR Apache-2.0)", + "optionalDependencies": { + "@types/trusted-types": "^2.0.7" } }, - "node_modules/d3-fetch": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/d3-fetch/-/d3-fetch-3.0.1.tgz", - "integrity": "sha512-kpkQIM20n3oLVBKGg6oHrUchHM3xODkTzjMoj7aWQFq5QEM+R6E4WkzT5+tojDY7yjez8KgCBRoj4aEr99Fdqw==", + "node_modules/dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", "dev": true, - "license": "ISC", + "license": "MIT", "dependencies": { - "d3-dsv": "1 - 3" + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" }, "engines": { - "node": ">=12" + "node": ">= 0.4" } }, - "node_modules/d3-force": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/d3-force/-/d3-force-3.0.0.tgz", - "integrity": "sha512-zxV/SsA+U4yte8051P4ECydjD/S+qeYtnaIyAs9tgHCqfguma/aAQDjo85A9Z6EKhBirHRJHXIgJUlffT4wdLg==", + "node_modules/ee-first": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", + "integrity": "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==", "dev": true, - "license": "ISC", + "license": "MIT" + }, + "node_modules/ejs": { + "version": "3.1.10", + "resolved": "https://registry.npmjs.org/ejs/-/ejs-3.1.10.tgz", + "integrity": "sha512-UeJmFfOrAQS8OJWPZ4qtgHyWExa088/MtK5UEyoJGFH67cDEXkZSviOiKRCZ4Xij0zxI3JECgYs3oKx+AizQBA==", + "dev": true, + "license": "Apache-2.0", "dependencies": { - "d3-dispatch": "1 - 3", - "d3-quadtree": "1 - 3", - "d3-timer": "1 - 3" + "jake": "^10.8.5" + }, + "bin": { + "ejs": "bin/cli.js" }, "engines": { - "node": ">=12" + "node": ">=0.10.0" } }, - "node_modules/d3-format": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/d3-format/-/d3-format-3.1.2.tgz", - "integrity": "sha512-AJDdYOdnyRDV5b6ArilzCPPwc1ejkHcoyFarqlPqT7zRYjhavcT3uSrqcMvsgh2CgoPbK3RCwyHaVyxYcP2Arg==", + "node_modules/electron-to-chromium": { + "version": "1.5.367", + "resolved": "https://registry.npmjs.org/electron-to-chromium/-/electron-to-chromium-1.5.367.tgz", + "integrity": "sha512-4Mk/mrynCNQ+atY40D3UpmhLWB6AHMbYMlIrPhHcMF6x0L7O0b052FCAsxw1LlaR++UFuNg3D/A6XCuGDa0guQ==", "dev": true, - "license": "ISC", + "license": "ISC" + }, + "node_modules/encodeurl": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz", + "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==", + "dev": true, + "license": "MIT", "engines": { - "node": ">=12" + "node": ">= 0.8" } }, - "node_modules/d3-geo": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/d3-geo/-/d3-geo-3.1.1.tgz", - "integrity": "sha512-637ln3gXKXOwhalDzinUgY83KzNWZRKbYubaG+fGVuc/dxO64RRljtCTnf5ecMyE1RIdtqpkVcq0IbtU2S8j2Q==", + "node_modules/enhanced-resolve": { + "version": "5.23.0", + "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.23.0.tgz", + "integrity": "sha512-yJN/BOOLxcOW2aQgeif9mSnaUB8KtvmMMp56oA1kx1CRfBKbhZm2pJ+NBY+3eOboHxix8lfjWpHE0Ei5U8RbSA==", "dev": true, - "license": "ISC", + "license": "MIT", "dependencies": { - "d3-array": "2.5.0 - 3" + "graceful-fs": "^4.2.4", + "tapable": "^2.3.3" }, "engines": { - "node": ">=12" + "node": ">=10.13.0" } }, - "node_modules/d3-hierarchy": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/d3-hierarchy/-/d3-hierarchy-3.1.2.tgz", - "integrity": "sha512-FX/9frcub54beBdugHjDCdikxThEqjnR93Qt7PvQTOHxyiNCAlvMrHhclk3cD5VeAaq9fxmfRp+CnWw9rEMBuA==", + "node_modules/entities": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz", + "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==", "dev": true, - "license": "ISC", + "license": "BSD-2-Clause", "engines": { - "node": ">=12" + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" } }, - "node_modules/d3-interpolate": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/d3-interpolate/-/d3-interpolate-3.0.1.tgz", - "integrity": "sha512-3bYs1rOD33uo8aqJfKP3JWPAibgw8Zm2+L9vBKEHJ2Rg+viTR7o5Mmv5mZcieN+FRYaAOWX5SJATX6k1PWz72g==", + "node_modules/es-abstract": { + "version": "1.24.2", + "resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.24.2.tgz", + "integrity": "sha512-2FpH9Q5i2RRwyEP1AylXe6nYLR5OhaJTZwmlcP0dL/+JCbgg7yyEo/sEK6HeGZRf3dFpWwThaRHVApXSkW3xeg==", "dev": true, - "license": "ISC", + "license": "MIT", "dependencies": { - "d3-color": "1 - 3" + "array-buffer-byte-length": "^1.0.2", + "arraybuffer.prototype.slice": "^1.0.4", + "available-typed-arrays": "^1.0.7", + "call-bind": "^1.0.8", + "call-bound": "^1.0.4", + "data-view-buffer": "^1.0.2", + "data-view-byte-length": "^1.0.2", + "data-view-byte-offset": "^1.0.1", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + "es-set-tostringtag": "^2.1.0", + "es-to-primitive": "^1.3.0", + "function.prototype.name": "^1.1.8", + "get-intrinsic": "^1.3.0", + "get-proto": "^1.0.1", + "get-symbol-description": "^1.1.0", + "globalthis": "^1.0.4", + "gopd": "^1.2.0", + "has-property-descriptors": "^1.0.2", + "has-proto": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "internal-slot": "^1.1.0", + "is-array-buffer": "^3.0.5", + "is-callable": "^1.2.7", + "is-data-view": "^1.0.2", + "is-negative-zero": "^2.0.3", + "is-regex": "^1.2.1", + "is-set": "^2.0.3", + "is-shared-array-buffer": "^1.0.4", + "is-string": "^1.1.1", + "is-typed-array": "^1.1.15", + "is-weakref": "^1.1.1", + "math-intrinsics": "^1.1.0", + "object-inspect": "^1.13.4", + "object-keys": "^1.1.1", + "object.assign": "^4.1.7", + "own-keys": "^1.0.1", + "regexp.prototype.flags": "^1.5.4", + "safe-array-concat": "^1.1.3", + "safe-push-apply": "^1.0.0", + "safe-regex-test": "^1.1.0", + "set-proto": "^1.0.0", + "stop-iteration-iterator": "^1.1.0", + "string.prototype.trim": "^1.2.10", + "string.prototype.trimend": "^1.0.9", + "string.prototype.trimstart": "^1.0.8", + "typed-array-buffer": "^1.0.3", + "typed-array-byte-length": "^1.0.3", + "typed-array-byte-offset": "^1.0.4", + "typed-array-length": "^1.0.7", + "unbox-primitive": "^1.1.0", + "which-typed-array": "^1.1.19" }, "engines": { - "node": ">=12" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/d3-path": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/d3-path/-/d3-path-3.1.0.tgz", - "integrity": "sha512-p3KP5HCf/bvjBSSKuXid6Zqijx7wIfNW+J/maPs+iwR35at5JCbLUT0LzF1cnjbCHWhqzQTIN2Jpe8pRebIEFQ==", + "node_modules/es-define-property": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", "dev": true, - "license": "ISC", + "license": "MIT", "engines": { - "node": ">=12" + "node": ">= 0.4" } }, - "node_modules/d3-polygon": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/d3-polygon/-/d3-polygon-3.0.1.tgz", - "integrity": "sha512-3vbA7vXYwfe1SYhED++fPUQlWSYTTGmFmQiany/gdbiWgU/iEyQzyymwL9SkJjFFuCS4902BSzewVGsHHmHtXg==", + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", "dev": true, - "license": "ISC", + "license": "MIT", "engines": { - "node": ">=12" + "node": ">= 0.4" } }, - "node_modules/d3-quadtree": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/d3-quadtree/-/d3-quadtree-3.0.1.tgz", - "integrity": "sha512-04xDrxQTDTCFwP5H6hRhsRcb9xxv2RzkcsygFzmkSIOJy3PeRJP7sNk3VRIbKXcog561P9oU0/rVH6vDROAgUw==", + "node_modules/es-module-lexer": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-2.1.0.tgz", + "integrity": "sha512-n27zTYMjYu1aj4MjCWzSP7G9r75utsaoc8m61weK+W8JMBGGQybd43GstCXZ3WNmSFtGT9wi59qQTW6mhTR5LQ==", "dev": true, - "license": "ISC", - "engines": { - "node": ">=12" - } + "license": "MIT" }, - "node_modules/d3-random": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/d3-random/-/d3-random-3.0.1.tgz", - "integrity": "sha512-FXMe9GfxTxqd5D6jFsQ+DJ8BJS4E/fT5mqqdjovykEB2oFbTMDVdg1MGFxfQW+FBOGoB++k8swBrgwSHT1cUXQ==", + "node_modules/es-object-atoms": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.2.tgz", + "integrity": "sha512-HWcBoN6NileqtSydK2FqHbS/LoDd2pqrnQHLyJzBj4kOp/ky2MWMN694xOfkK8/SnUsW2DH7EfyVlydKCsm1Zw==", "dev": true, - "license": "ISC", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0" + }, "engines": { - "node": ">=12" + "node": ">= 0.4" } }, - "node_modules/d3-sankey": { - "version": "0.12.3", - "resolved": "https://registry.npmjs.org/d3-sankey/-/d3-sankey-0.12.3.tgz", - "integrity": "sha512-nQhsBRmM19Ax5xEIPLMY9ZmJ/cDvd1BG3UVvt5h3WRxKg5zGRbvnteTyWAbzeSvlh3tW7ZEmq4VwR5mB3tutmQ==", + "node_modules/es-set-tostringtag": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", + "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", "dev": true, - "license": "BSD-3-Clause", + "license": "MIT", "dependencies": { - "d3-array": "1 - 2", - "d3-shape": "^1.2.0" + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" } }, - "node_modules/d3-sankey/node_modules/d3-array": { - "version": "2.12.1", - "resolved": "https://registry.npmjs.org/d3-array/-/d3-array-2.12.1.tgz", - "integrity": "sha512-B0ErZK/66mHtEsR1TkPEEkwdy+WDesimkM5gpZr5Dsg54BiTA5RXtYW5qTLIAcekaS9xfZrzBLF/OAkB3Qn1YQ==", + "node_modules/es-to-primitive": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-to-primitive/-/es-to-primitive-1.3.0.tgz", + "integrity": "sha512-w+5mJ3GuFL+NjVtJlvydShqE1eN3h3PbI7/5LAsYJP/2qtuMXjfL2LpHSRqo4b4eSF5K/DH1JXKUAHSB2UW50g==", "dev": true, - "license": "BSD-3-Clause", + "license": "MIT", "dependencies": { - "internmap": "^1.0.0" + "is-callable": "^1.2.7", + "is-date-object": "^1.0.5", + "is-symbol": "^1.0.4" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/d3-sankey/node_modules/d3-path": { - "version": "1.0.9", - "resolved": "https://registry.npmjs.org/d3-path/-/d3-path-1.0.9.tgz", - "integrity": "sha512-VLaYcn81dtHVTjEHd8B+pbe9yHWpXKZUC87PzoFmsFrJqgFwDe/qxfp5MlfsfM1V5E/iVt0MmEbWQ7FVIXh/bg==", + "node_modules/es-toolkit": { + "version": "1.47.0", + "resolved": "https://registry.npmjs.org/es-toolkit/-/es-toolkit-1.47.0.tgz", + "integrity": "sha512-n1GuoD0WEQZMBk5tttoZSqwgyLx01oqa5XsBmCHwPyNe1S9jPBEmtR2pSgp2kJuWE3ciFZ6yRHmY4pM4C3OOkw==", "dev": true, - "license": "BSD-3-Clause" + "license": "MIT", + "workspaces": [ + "docs", + "benchmarks" + ] }, - "node_modules/d3-sankey/node_modules/d3-shape": { - "version": "1.3.7", - "resolved": "https://registry.npmjs.org/d3-shape/-/d3-shape-1.3.7.tgz", - "integrity": "sha512-EUkvKjqPFUAZyOlhY5gzCxCeI0Aep04LwIRpsZ/mLFelJiUfnK56jo5JMDSE7yyP2kLSb6LtF+S5chMk7uqPqw==", + "node_modules/esbuild": { + "version": "0.27.7", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.7.tgz", + "integrity": "sha512-IxpibTjyVnmrIQo5aqNpCgoACA/dTKLTlhMHihVHhdkxKyPO1uBBthumT0rdHmcsk9uMonIWS0m4FljWzILh3w==", "dev": true, - "license": "BSD-3-Clause", - "dependencies": { - "d3-path": "1" + "hasInstallScript": true, + "license": "MIT", + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": "0.27.7", + "@esbuild/android-arm": "0.27.7", + "@esbuild/android-arm64": "0.27.7", + "@esbuild/android-x64": "0.27.7", + "@esbuild/darwin-arm64": "0.27.7", + "@esbuild/darwin-x64": "0.27.7", + "@esbuild/freebsd-arm64": "0.27.7", + "@esbuild/freebsd-x64": "0.27.7", + "@esbuild/linux-arm": "0.27.7", + "@esbuild/linux-arm64": "0.27.7", + "@esbuild/linux-ia32": "0.27.7", + "@esbuild/linux-loong64": "0.27.7", + "@esbuild/linux-mips64el": "0.27.7", + "@esbuild/linux-ppc64": "0.27.7", + "@esbuild/linux-riscv64": "0.27.7", + "@esbuild/linux-s390x": "0.27.7", + "@esbuild/linux-x64": "0.27.7", + "@esbuild/netbsd-arm64": "0.27.7", + "@esbuild/netbsd-x64": "0.27.7", + "@esbuild/openbsd-arm64": "0.27.7", + "@esbuild/openbsd-x64": "0.27.7", + "@esbuild/openharmony-arm64": "0.27.7", + "@esbuild/sunos-x64": "0.27.7", + "@esbuild/win32-arm64": "0.27.7", + "@esbuild/win32-ia32": "0.27.7", + "@esbuild/win32-x64": "0.27.7" } }, - "node_modules/d3-sankey/node_modules/internmap": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/internmap/-/internmap-1.0.1.tgz", - "integrity": "sha512-lDB5YccMydFBtasVtxnZ3MRBHuaoE8GKsppq+EchKL2U4nK/DmEpPHNH8MZe5HkMtpSiTSOZwfN0tzYjO/lJEw==", + "node_modules/escalade": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", + "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", "dev": true, - "license": "ISC" + "license": "MIT", + "engines": { + "node": ">=6" + } }, - "node_modules/d3-scale": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/d3-scale/-/d3-scale-4.0.2.tgz", - "integrity": "sha512-GZW464g1SH7ag3Y7hXjf8RoUuAFIqklOAq3MRl4OaWabTFJY9PN/E1YklhXLh+OQ3fM9yS2nOkCoS+WLZ6kvxQ==", + "node_modules/escape-html": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz", + "integrity": "sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow==", "dev": true, - "license": "ISC", - "dependencies": { - "d3-array": "2.10.0 - 3", - "d3-format": "1 - 3", - "d3-interpolate": "1.2.0 - 3", - "d3-time": "2.1.1 - 3", - "d3-time-format": "2 - 4" - }, + "license": "MIT" + }, + "node_modules/escape-string-regexp": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", + "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", + "dev": true, + "license": "MIT", "engines": { - "node": ">=12" + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/d3-scale-chromatic": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/d3-scale-chromatic/-/d3-scale-chromatic-3.1.0.tgz", - "integrity": "sha512-A3s5PWiZ9YCXFye1o246KoscMWqf8BsD9eRiJ3He7C9OBaxKhAd5TFCdEx/7VbKtxxTsu//1mMJFrEt572cEyQ==", + "node_modules/eslint": { + "version": "9.39.4", + "resolved": "https://registry.npmjs.org/eslint/-/eslint-9.39.4.tgz", + "integrity": "sha512-XoMjdBOwe/esVgEvLmNsD3IRHkm7fbKIUGvrleloJXUZgDHig2IPWNniv+GwjyJXzuNqVjlr5+4yVUZjycJwfQ==", "dev": true, - "license": "ISC", + "license": "MIT", "dependencies": { - "d3-color": "1 - 3", - "d3-interpolate": "1 - 3" + "@eslint-community/eslint-utils": "^4.8.0", + "@eslint-community/regexpp": "^4.12.1", + "@eslint/config-array": "^0.21.2", + "@eslint/config-helpers": "^0.4.2", + "@eslint/core": "^0.17.0", + "@eslint/eslintrc": "^3.3.5", + "@eslint/js": "9.39.4", + "@eslint/plugin-kit": "^0.4.1", + "@humanfs/node": "^0.16.6", + "@humanwhocodes/module-importer": "^1.0.1", + "@humanwhocodes/retry": "^0.4.2", + "@types/estree": "^1.0.6", + "ajv": "^6.14.0", + "chalk": "^4.0.0", + "cross-spawn": "^7.0.6", + "debug": "^4.3.2", + "escape-string-regexp": "^4.0.0", + "eslint-scope": "^8.4.0", + "eslint-visitor-keys": "^4.2.1", + "espree": "^10.4.0", + "esquery": "^1.5.0", + "esutils": "^2.0.2", + "fast-deep-equal": "^3.1.3", + "file-entry-cache": "^8.0.0", + "find-up": "^5.0.0", + "glob-parent": "^6.0.2", + "ignore": "^5.2.0", + "imurmurhash": "^0.1.4", + "is-glob": "^4.0.0", + "json-stable-stringify-without-jsonify": "^1.0.1", + "lodash.merge": "^4.6.2", + "minimatch": "^3.1.5", + "natural-compare": "^1.4.0", + "optionator": "^0.9.3" + }, + "bin": { + "eslint": "bin/eslint.js" }, "engines": { - "node": ">=12" + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "url": "https://eslint.org/donate" + }, + "peerDependencies": { + "jiti": "*" + }, + "peerDependenciesMeta": { + "jiti": { + "optional": true + } } }, - "node_modules/d3-selection": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/d3-selection/-/d3-selection-3.0.0.tgz", - "integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==", + "node_modules/eslint-config-prettier": { + "version": "10.1.8", + "resolved": "https://registry.npmjs.org/eslint-config-prettier/-/eslint-config-prettier-10.1.8.tgz", + "integrity": "sha512-82GZUjRS0p/jganf6q1rEO25VSoHH0hKPCTrgillPjdI/3bgBhAE1QzHrHTizjpRvy6pGAvKjDJtk2pF9NDq8w==", "dev": true, - "license": "ISC", - "engines": { - "node": ">=12" + "license": "MIT", + "bin": { + "eslint-config-prettier": "bin/cli.js" + }, + "funding": { + "url": "https://opencollective.com/eslint-config-prettier" + }, + "peerDependencies": { + "eslint": ">=7.0.0" } }, - "node_modules/d3-shape": { - "version": "3.2.0", - "resolved": "https://registry.npmjs.org/d3-shape/-/d3-shape-3.2.0.tgz", - "integrity": "sha512-SaLBuwGm3MOViRq2ABk3eLoxwZELpH6zhl3FbAoJ7Vm1gofKx6El1Ib5z23NUEhF9AsGl7y+dzLe5Cw2AArGTA==", + "node_modules/eslint-plugin-storybook": { + "version": "10.4.2", + "resolved": "https://registry.npmjs.org/eslint-plugin-storybook/-/eslint-plugin-storybook-10.4.2.tgz", + "integrity": "sha512-l3/vzLRmb8VSi3X1Bo6/Pa+64naw1jFsZE5jPPA4izvVdNhH1rF4rGuOC3kDTU926qKVBQtKua8D24XWQtvcGg==", "dev": true, - "license": "ISC", + "license": "MIT", "dependencies": { - "d3-path": "^3.1.0" + "@typescript-eslint/utils": "^8.48.0" }, - "engines": { - "node": ">=12" + "peerDependencies": { + "eslint": ">=8", + "storybook": "^10.4.2" } }, - "node_modules/d3-time": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/d3-time/-/d3-time-3.1.0.tgz", - "integrity": "sha512-VqKjzBLejbSMT4IgbmVgDjpkYrNWUYJnbCGo874u7MMKIWsILRX+OpX/gTk8MqjpT1A/c6HY2dCA77ZN0lkQ2Q==", + "node_modules/eslint-plugin-svelte": { + "version": "3.19.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-svelte/-/eslint-plugin-svelte-3.19.0.tgz", + "integrity": "sha512-t3rNaZeXz4d2gG4uJyMEYfJCFKf22+SWbSizIIXIWKu4wM+XPLiMWuSSr/C5821JmFeN9ogK+eExbG+z+twyxw==", "dev": true, - "license": "ISC", + "license": "MIT", "dependencies": { - "d3-array": "2 - 3" + "@eslint-community/eslint-utils": "^4.6.1", + "@jridgewell/sourcemap-codec": "^1.5.0", + "esutils": "^2.0.3", + "globals": "^16.0.0", + "known-css-properties": "^0.37.0", + "postcss": "^8.4.49", + "postcss-load-config": "^3.1.4", + "postcss-safe-parser": "^7.0.0", + "semver": "^7.6.3", + "svelte-eslint-parser": "^1.7.0" }, "engines": { - "node": ">=12" + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "url": "https://github.com/sponsors/ota-meshi" + }, + "peerDependencies": { + "eslint": "^8.57.1 || ^9.0.0 || ^10.0.0", + "svelte": "^3.37.0 || ^4.0.0 || ^5.0.0" + }, + "peerDependenciesMeta": { + "svelte": { + "optional": true + } } }, - "node_modules/d3-time-format": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/d3-time-format/-/d3-time-format-4.1.0.tgz", - "integrity": "sha512-dJxPBlzC7NugB2PDLwo9Q8JiTR3M3e4/XANkreKSUxF8vvXKqm1Yfq4Q5dl8budlunRVlUUaDUgFt7eA8D6NLg==", + "node_modules/eslint-scope": { + "version": "8.4.0", + "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-8.4.0.tgz", + "integrity": "sha512-sNXOfKCn74rt8RICKMvJS7XKV/Xk9kA7DyJr8mJik3S7Cwgy3qlkkmyS2uQB3jiJg6VNdZd/pDBJu0nvG2NlTg==", "dev": true, - "license": "ISC", + "license": "BSD-2-Clause", "dependencies": { - "d3-time": "1 - 3" + "esrecurse": "^4.3.0", + "estraverse": "^5.2.0" }, "engines": { - "node": ">=12" + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "url": "https://opencollective.com/eslint" } }, - "node_modules/d3-timer": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/d3-timer/-/d3-timer-3.0.1.tgz", - "integrity": "sha512-ndfJ/JxxMd3nw31uyKoY2naivF+r29V+Lc0svZxe1JvvIRmi8hUsrMvdOwgS1o6uBHmiz91geQ0ylPP0aj1VUA==", + "node_modules/eslint-visitor-keys": { + "version": "4.2.1", + "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-4.2.1.tgz", + "integrity": "sha512-Uhdk5sfqcee/9H/rCOJikYz67o0a2Tw2hGRPOG2Y1R2dg7brRe1uG0yaNQDHu+TO/uQPF/5eCapvYSmHUjt7JQ==", "dev": true, - "license": "ISC", + "license": "Apache-2.0", "engines": { - "node": ">=12" + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + }, + "funding": { + "url": "https://opencollective.com/eslint" } }, - "node_modules/d3-transition": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/d3-transition/-/d3-transition-3.0.1.tgz", - "integrity": "sha512-ApKvfjsSR6tg06xrL434C0WydLr7JewBB3V+/39RMHsaXTOG0zmt/OAXeng5M5LBm0ojmxJrpomQVZ1aPvBL4w==", + "node_modules/eslint/node_modules/@eslint/js": { + "version": "9.39.4", + "resolved": "https://registry.npmjs.org/@eslint/js/-/js-9.39.4.tgz", + "integrity": "sha512-nE7DEIchvtiFTwBw4Lfbu59PG+kCofhjsKaCWzxTpt4lfRjRMqG6uMBzKXuEcyXhOHoUp9riAm7/aWYGhXZ9cw==", "dev": true, - "license": "ISC", - "dependencies": { - "d3-color": "1 - 3", - "d3-dispatch": "1 - 3", - "d3-ease": "1 - 3", - "d3-interpolate": "1 - 3", - "d3-timer": "1 - 3" - }, + "license": "MIT", "engines": { - "node": ">=12" + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" }, - "peerDependencies": { - "d3-selection": "2 - 3" + "funding": { + "url": "https://eslint.org/donate" } }, - "node_modules/d3-zoom": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/d3-zoom/-/d3-zoom-3.0.0.tgz", - "integrity": "sha512-b8AmV3kfQaqWAuacbPuNbL6vahnOJflOhexLzMMNLga62+/nh0JzvJ0aO/5a5MVgUFGS7Hu1P9P03o3fJkDCyw==", + "node_modules/eslint/node_modules/ajv": { + "version": "6.15.0", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.15.0.tgz", + "integrity": "sha512-fgFx7Hfoq60ytK2c7DhnF8jIvzYgOMxfugjLOSMHjLIPgenqa7S7oaagATUq99mV6IYvN2tRmC0wnTYX6iPbMw==", "dev": true, - "license": "ISC", + "license": "MIT", "dependencies": { - "d3-dispatch": "1 - 3", - "d3-drag": "2 - 3", - "d3-interpolate": "1 - 3", - "d3-selection": "2 - 3", - "d3-transition": "2 - 3" + "fast-deep-equal": "^3.1.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" }, - "engines": { - "node": ">=12" + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" } }, - "node_modules/dagre-d3-es": { - "version": "7.0.14", - "resolved": "https://registry.npmjs.org/dagre-d3-es/-/dagre-d3-es-7.0.14.tgz", - "integrity": "sha512-P4rFMVq9ESWqmOgK+dlXvOtLwYg0i7u0HBGJER0LZDJT2VHIPAMZ/riPxqJceWMStH5+E61QxFra9kIS3AqdMg==", + "node_modules/eslint/node_modules/json-schema-traverse": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", "dev": true, - "license": "MIT", - "dependencies": { - "d3": "^7.9.0", - "lodash-es": "^4.17.21" - } + "license": "MIT" }, - "node_modules/dayjs": { - "version": "1.11.21", - "resolved": "https://registry.npmjs.org/dayjs/-/dayjs-1.11.21.tgz", - "integrity": "sha512-98IT+HOahAisibz/yjKbzuOBwYcjJ7BCLPzARyHiyEBmRz4fatF+KPJszEHXsGYjUG234aH/cOjW1wwTbKUZlA==", + "node_modules/esm-env": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/esm-env/-/esm-env-1.2.2.tgz", + "integrity": "sha512-Epxrv+Nr/CaL4ZcFGPJIYLWFom+YeV1DqMLHJoEd9SYRxNbaFruBwfEX/kkHUJf55j2+TUbmDcmuilbP1TmXHA==", "dev": true, "license": "MIT" }, - "node_modules/debug": { - "version": "4.4.3", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz", - "integrity": "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==", + "node_modules/espree": { + "version": "10.4.0", + "resolved": "https://registry.npmjs.org/espree/-/espree-10.4.0.tgz", + "integrity": "sha512-j6PAQ2uUr79PZhBjP5C5fhl8e39FmRnOjsD5lGnWrFU8i2G776tBK7+nP8KuQUTTyAZUwfQqXAgrVH5MbH9CYQ==", "dev": true, - "license": "MIT", + "license": "BSD-2-Clause", "dependencies": { - "ms": "^2.1.3" + "acorn": "^8.15.0", + "acorn-jsx": "^5.3.2", + "eslint-visitor-keys": "^4.2.1" }, "engines": { - "node": ">=6.0" + "node": "^18.18.0 || ^20.9.0 || >=21.1.0" }, - "peerDependenciesMeta": { - "supports-color": { - "optional": true - } + "funding": { + "url": "https://opencollective.com/eslint" } }, - "node_modules/decode-named-character-reference": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/decode-named-character-reference/-/decode-named-character-reference-1.2.0.tgz", - "integrity": "sha512-c6fcElNV6ShtZXmsgNgFFV5tVX2PaV4g+MOAkb8eXHvn6sryJBrZa9r0zV6+dtTyoCKxtDy5tyQ5ZwQuidtd+Q==", + "node_modules/esprima": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz", + "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==", "dev": true, - "license": "MIT", - "dependencies": { - "character-entities": "^2.0.0" + "license": "BSD-2-Clause", + "bin": { + "esparse": "bin/esparse.js", + "esvalidate": "bin/esvalidate.js" }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" + "engines": { + "node": ">=4" } }, - "node_modules/dedent": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/dedent/-/dedent-1.6.0.tgz", - "integrity": "sha512-F1Z+5UCFpmQUzJa11agbyPVMbpgT/qA3/SKyJ1jyBgm7dUcUEa8v9JwDkerSQXfakBwFljIxhOJqGkjUwZ9FSA==", + "node_modules/esquery": { + "version": "1.7.0", + "resolved": "https://registry.npmjs.org/esquery/-/esquery-1.7.0.tgz", + "integrity": "sha512-Ap6G0WQwcU/LHsvLwON1fAQX9Zp0A2Y6Y/cJBl9r/JbW90Zyg4/zbG6zzKa2OTALELarYHmKu0GhpM5EO+7T0g==", "dev": true, - "license": "MIT", - "peerDependencies": { - "babel-plugin-macros": "^3.1.0" + "license": "BSD-3-Clause", + "dependencies": { + "estraverse": "^5.1.0" }, - "peerDependenciesMeta": { - "babel-plugin-macros": { - "optional": true - } + "engines": { + "node": ">=0.10" } }, - "node_modules/dedent-js": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/dedent-js/-/dedent-js-1.0.1.tgz", - "integrity": "sha512-OUepMozQULMLUmhxS95Vudo0jb0UchLimi3+pQ2plj61Fcy8axbP9hbiD4Sz6DPqn6XG3kfmziVfQ1rSys5AJQ==", - "dev": true, - "license": "MIT" - }, - "node_modules/deep-eql": { - "version": "5.0.2", - "resolved": "https://registry.npmjs.org/deep-eql/-/deep-eql-5.0.2.tgz", - "integrity": "sha512-h5k/5U50IJJFpzfL6nO9jaaumfjO/f2NjK/oYB2Djzm4p9L+3T9qWpZqZ2hAbLPuuYq9wrU08WQyBTL5GbPk5Q==", + "node_modules/esrap": { + "version": "1.4.9", + "resolved": "https://registry.npmjs.org/esrap/-/esrap-1.4.9.tgz", + "integrity": "sha512-3OMlcd0a03UGuZpPeUC1HxR3nA23l+HEyCiZw3b3FumJIN9KphoGzDJKMXI1S72jVS1dsenDyQC0kJlO1U9E1g==", "dev": true, "license": "MIT", - "engines": { - "node": ">=6" + "dependencies": { + "@jridgewell/sourcemap-codec": "^1.4.15" } }, - "node_modules/deep-is": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/deep-is/-/deep-is-0.1.4.tgz", - "integrity": "sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==", + "node_modules/esrecurse": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/esrecurse/-/esrecurse-4.3.0.tgz", + "integrity": "sha512-KmfKL3b6G+RXvP8N1vr3Tq1kL/oCFgn2NYXEtqP8/L3pKapUA4G8cFVaoF3SU323CD4XypR/ffioHmkti6/Tag==", "dev": true, - "license": "MIT" + "license": "BSD-2-Clause", + "dependencies": { + "estraverse": "^5.2.0" + }, + "engines": { + "node": ">=4.0" + } }, - "node_modules/deepmerge": { - "version": "4.3.1", - "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz", - "integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==", + "node_modules/estraverse": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", + "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", "dev": true, - "license": "MIT", + "license": "BSD-2-Clause", "engines": { - "node": ">=0.10.0" + "node": ">=4.0" } }, - "node_modules/default-browser": { - "version": "5.5.0", - "resolved": "https://registry.npmjs.org/default-browser/-/default-browser-5.5.0.tgz", - "integrity": "sha512-H9LMLr5zwIbSxrmvikGuI/5KGhZ8E2zH3stkMgM5LpOWDutGM2JZaj460Udnf1a+946zc7YBgrqEWwbk7zHvGw==", + "node_modules/estree-walker": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-3.0.3.tgz", + "integrity": "sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g==", "dev": true, "license": "MIT", "dependencies": { - "bundle-name": "^4.1.0", - "default-browser-id": "^5.0.0" - }, + "@types/estree": "^1.0.0" + } + }, + "node_modules/esutils": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz", + "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==", + "dev": true, + "license": "BSD-2-Clause", "engines": { - "node": ">=18" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" + "node": ">=0.10.0" } }, - "node_modules/default-browser-id": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/default-browser-id/-/default-browser-id-5.0.1.tgz", - "integrity": "sha512-x1VCxdX4t+8wVfd1so/9w+vQ4vx7lKd2Qp5tDRutErwmR85OgmfX7RlLRMWafRMY7hbEiXIbudNrjOAPa/hL8Q==", + "node_modules/eta": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/eta/-/eta-4.6.0.tgz", + "integrity": "sha512-lW6is4T1NFOYnmqGZIfvixqj7A7sSvScF+DN8EK6K58xI5MZ5UvYe0GjopxOXQtZvUn4eDdVuZ8XSoYWTMEKwA==", "dev": true, "license": "MIT", "engines": { - "node": ">=18" + "node": ">=20" }, "funding": { - "url": "https://github.com/sponsors/sindresorhus" + "url": "https://github.com/bgub/eta?sponsor=1" } }, - "node_modules/define-lazy-prop": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz", - "integrity": "sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==", + "node_modules/etag": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz", + "integrity": "sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==", "dev": true, "license": "MIT", "engines": { - "node": ">=12" - }, - "funding": { - "url": "https://github.com/sponsors/sindresorhus" + "node": ">= 0.6" } }, - "node_modules/delaunator": { - "version": "5.1.0", - "resolved": "https://registry.npmjs.org/delaunator/-/delaunator-5.1.0.tgz", - "integrity": "sha512-AGrQ4QSgssa1NGmWmLPqN5NY2KajF5MqxetNEO+o0n3ZwZZeTmt7bBnvzHWrmkZFxGgr4HdyFgelzgi06otLuQ==", + "node_modules/eventemitter3": { + "version": "4.0.7", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-4.0.7.tgz", + "integrity": "sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==", "dev": true, - "license": "ISC", - "dependencies": { - "robust-predicates": "^3.0.2" - } + "license": "MIT" }, - "node_modules/depd": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/depd/-/depd-2.0.0.tgz", - "integrity": "sha512-g7nH6P6dyDioJogAAGprGpCtVImJhpPk/roCzdb3fIh61/s/nPsfR6onyMwkCAR/OlC3yBC0lESvUoQEAssIrw==", + "node_modules/eventsource": { + "version": "3.0.7", + "resolved": "https://registry.npmjs.org/eventsource/-/eventsource-3.0.7.tgz", + "integrity": "sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA==", "dev": true, "license": "MIT", + "dependencies": { + "eventsource-parser": "^3.0.1" + }, "engines": { - "node": ">= 0.8" + "node": ">=18.0.0" } }, - "node_modules/dequal": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz", - "integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==", + "node_modules/eventsource-parser": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.1.0.tgz", + "integrity": "sha512-kJezFj9YFAMLeORyi7aCLxLbD5/qWMQnoMVlVPyHIll7lgRJCc3JVln9Vgl9nwQi0YkMnhdGTMNn7CkRRAptMg==", "dev": true, "license": "MIT", "engines": { - "node": ">=6" + "node": ">=18.0.0" } }, - "node_modules/detect-libc": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.4.tgz", - "integrity": "sha512-3UDv+G9CsCKO1WKMGw9fwq/SWJYbI0c5Y7LU1AXYoDdbhE2AHQ6N6Nb34sG8Fj7T5APy8qXDCKuuIHd1BR0tVA==", + "node_modules/expect-type": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/expect-type/-/expect-type-1.3.0.tgz", + "integrity": "sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA==", "dev": true, "license": "Apache-2.0", "engines": { - "node": ">=8" + "node": ">=12.0.0" } }, - "node_modules/devalue": { - "version": "5.8.1", - "resolved": "https://registry.npmjs.org/devalue/-/devalue-5.8.1.tgz", - "integrity": "sha512-4CXDYRBGqN+57wVJkuXBYmpAVUSg3L6JAQa/DFqm238G73E1wuyc/JhGQJzN7vUf/CMphYau2zXbfWzDR5aTEw==", + "node_modules/express": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/express/-/express-5.2.1.tgz", + "integrity": "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "accepts": "^2.0.0", + "body-parser": "^2.2.1", + "content-disposition": "^1.0.0", + "content-type": "^1.0.5", + "cookie": "^0.7.1", + "cookie-signature": "^1.2.1", + "debug": "^4.4.0", + "depd": "^2.0.0", + "encodeurl": "^2.0.0", + "escape-html": "^1.0.3", + "etag": "^1.8.1", + "finalhandler": "^2.1.0", + "fresh": "^2.0.0", + "http-errors": "^2.0.0", + "merge-descriptors": "^2.0.0", + "mime-types": "^3.0.0", + "on-finished": "^2.4.1", + "once": "^1.4.0", + "parseurl": "^1.3.3", + "proxy-addr": "^2.0.7", + "qs": "^6.14.0", + "range-parser": "^1.2.1", + "router": "^2.2.0", + "send": "^1.1.0", + "serve-static": "^2.2.0", + "statuses": "^2.0.1", + "type-is": "^2.0.1", + "vary": "^1.1.2" + }, + "engines": { + "node": ">= 18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } }, - "node_modules/devlop": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/devlop/-/devlop-1.1.0.tgz", - "integrity": "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA==", + "node_modules/express-rate-limit": { + "version": "8.5.2", + "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.2.tgz", + "integrity": "sha512-5Kb34ipNX694DH48vN9irak1Qx30nb0PLYHXfJgw4YEjiC3ZEmZJhwOp+VfiCYwFzvFTdB9QkArYS5kXa2cx2A==", "dev": true, "license": "MIT", "dependencies": { - "dequal": "^2.0.0" + "ip-address": "^10.2.0" + }, + "engines": { + "node": ">= 16" }, "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" + "url": "https://github.com/sponsors/express-rate-limit" + }, + "peerDependencies": { + "express": ">= 4.11" } }, - "node_modules/dexie": { - "version": "4.0.11", - "resolved": "https://registry.npmjs.org/dexie/-/dexie-4.0.11.tgz", - "integrity": "sha512-SOKO002EqlvBYYKQSew3iymBoN2EQ4BDw/3yprjh7kAfFzjBYkaMNa/pZvcA7HSWlcKSQb9XhPe3wKyQ0x4A8A==", + "node_modules/extend": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", + "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==", "dev": true, - "license": "Apache-2.0" + "license": "MIT" }, - "node_modules/dom-accessibility-api": { - "version": "0.5.16", - "resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.5.16.tgz", - "integrity": "sha512-X7BJ2yElsnOJ30pZF4uIIDfBEVgF4XEBxL9Bxhy6dnrm5hkzqmsWHGTiHqRiITNhMyFLyAiWndIJP7Z1NTteDg==", + "node_modules/fast-deep-equal": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz", + "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==", "dev": true, - "license": "MIT", - "peer": true + "license": "MIT" }, - "node_modules/dompurify": { - "version": "3.4.8", - "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.4.8.tgz", - "integrity": "sha512-yb1cEmaOum7wFvOCSQxyfgVlv5D47Rc30iZWoMpbDIWTnJ6grDDQyu2KFJzB2k7u0pMuJcQ1zphH//fFnw2tjQ==", + "node_modules/fast-json-stable-stringify": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", + "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==", "dev": true, - "license": "(MPL-2.0 OR Apache-2.0)", - "optionalDependencies": { - "@types/trusted-types": "^2.0.7" - } + "license": "MIT" }, - "node_modules/dunder-proto": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", - "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", + "node_modules/fast-levenshtein": { + "version": "2.0.6", + "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz", + "integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==", + "dev": true, + "license": "MIT" + }, + "node_modules/fast-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.2.tgz", + "integrity": "sha512-rVjf7ArG3LTk+FS6Yw81V1DLuZl1bRbNrev6Tmd/9RaroeeRRJhAt7jg/6YFxbvAQXUCavSoZhPPj6oOx+5KjQ==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fastify" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/fastify" + } + ], + "license": "BSD-3-Clause" + }, + "node_modules/fdir": { + "version": "6.5.0", + "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.5.0.tgz", + "integrity": "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==", "dev": true, "license": "MIT", - "dependencies": { - "call-bind-apply-helpers": "^1.0.1", - "es-errors": "^1.3.0", - "gopd": "^1.2.0" - }, "engines": { - "node": ">= 0.4" + "node": ">=12.0.0" + }, + "peerDependencies": { + "picomatch": "^3 || ^4" + }, + "peerDependenciesMeta": { + "picomatch": { + "optional": true + } } }, - "node_modules/ee-first": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz", - "integrity": "sha512-WMwm9LhRUo+WUaRN+vRuETqG89IgZphVSNkdFgeb6sS/E4OrDIN7t48CAewSHXc6C8lefD8KKfr5vY61brQlow==", + "node_modules/fflate": { + "version": "0.8.3", + "resolved": "https://registry.npmjs.org/fflate/-/fflate-0.8.3.tgz", + "integrity": "sha512-tbZNuJrLwGUp3zshBtdy4W+ORxZuIh8a5ilyIEQDC5rY1f3U20JMry0Ll3WBzU58EZKsEuJFXhb5gwv8CsPvgA==", "dev": true, "license": "MIT" }, - "node_modules/encodeurl": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/encodeurl/-/encodeurl-2.0.0.tgz", - "integrity": "sha512-Q0n9HRi4m6JuGIV1eFlmvJB7ZEVxu93IrMyiMsGC0lrMJMWzRgx6WGquyfQgZVb31vhGgXnfmPNNXmxnOkRBrg==", + "node_modules/file-entry-cache": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-8.0.0.tgz", + "integrity": "sha512-XXTUwCvisa5oacNGRP9SfNtYBNAMi+RPwBFmblZEF7N7swHYQS6/Zfk7SRwx4D5j3CH211YNRco1DEMNVfZCnQ==", "dev": true, "license": "MIT", + "dependencies": { + "flat-cache": "^4.0.0" + }, "engines": { - "node": ">= 0.8" + "node": ">=16.0.0" } }, - "node_modules/enhanced-resolve": { - "version": "5.18.2", - "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.18.2.tgz", - "integrity": "sha512-6Jw4sE1maoRJo3q8MsSIn2onJFbLTOjY9hlx4DZXmOKvLRd1Ok2kXmAGXaafL2+ijsJZ1ClYbl/pmqr9+k4iUQ==", + "node_modules/filelist": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/filelist/-/filelist-1.0.6.tgz", + "integrity": "sha512-5giy2PkLYY1cP39p17Ech+2xlpTRL9HLspOfEgm0L6CwBXBTgsK5ou0JtzYuepxkaQ/tvhCFIJ5uXo0OrM2DxA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "minimatch": "^5.0.1" + } + }, + "node_modules/filelist/node_modules/brace-expansion": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.1.1.tgz", + "integrity": "sha512-WR1cURNjuvBLMZBMbqM0UoE+WAfdUcEV1ccD8PVBVOI+Z3ND4+SZbN8RsfT2bMuG1qwz5RFvPukSZm5fF2D5eA==", "dev": true, "license": "MIT", "dependencies": { - "graceful-fs": "^4.2.4", - "tapable": "^2.2.0" + "balanced-match": "^1.0.0" + } + }, + "node_modules/filelist/node_modules/minimatch": { + "version": "5.1.9", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.9.tgz", + "integrity": "sha512-7o1wEA2RyMP7Iu7GNba9vc0RWWGACJOCZBJX2GJWip0ikV+wcOsgVuY9uE8CPiyQhkGFSlhuSkZPavN7u1c2Fw==", + "dev": true, + "license": "ISC", + "dependencies": { + "brace-expansion": "^2.0.1" }, "engines": { - "node": ">=10.13.0" + "node": ">=10" } }, - "node_modules/entities": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz", - "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==", + "node_modules/filesize": { + "version": "10.1.6", + "resolved": "https://registry.npmjs.org/filesize/-/filesize-10.1.6.tgz", + "integrity": "sha512-sJslQKU2uM33qH5nqewAwVB2QgR6w1aMNsYUp3aN5rMRyXEwJGmZvaWzeJFNTOXWlHQyBFCWrdj3fV/fsTOX8w==", "dev": true, - "license": "BSD-2-Clause", + "license": "BSD-3-Clause", "engines": { - "node": ">=0.12" - }, - "funding": { - "url": "https://github.com/fb55/entities?sponsor=1" + "node": ">= 10.4.0" } - }, - "node_modules/es-define-property": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", - "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", + }, + "node_modules/finalhandler": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-2.1.1.tgz", + "integrity": "sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA==", "dev": true, "license": "MIT", + "dependencies": { + "debug": "^4.4.0", + "encodeurl": "^2.0.0", + "escape-html": "^1.0.3", + "on-finished": "^2.4.1", + "parseurl": "^1.3.3", + "statuses": "^2.0.1" + }, "engines": { - "node": ">= 0.4" + "node": ">= 18.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" } }, - "node_modules/es-errors": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", - "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "node_modules/find-up": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", + "integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==", "dev": true, "license": "MIT", + "dependencies": { + "locate-path": "^6.0.0", + "path-exists": "^4.0.0" + }, "engines": { - "node": ">= 0.4" + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/es-module-lexer": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/es-module-lexer/-/es-module-lexer-2.1.0.tgz", - "integrity": "sha512-n27zTYMjYu1aj4MjCWzSP7G9r75utsaoc8m61weK+W8JMBGGQybd43GstCXZ3WNmSFtGT9wi59qQTW6mhTR5LQ==", - "dev": true, - "license": "MIT" - }, - "node_modules/es-object-atoms": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", - "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "node_modules/flat-cache": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-4.0.1.tgz", + "integrity": "sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw==", "dev": true, "license": "MIT", "dependencies": { - "es-errors": "^1.3.0" + "flatted": "^3.2.9", + "keyv": "^4.5.4" }, "engines": { - "node": ">= 0.4" + "node": ">=16" } }, - "node_modules/es-toolkit": { - "version": "1.46.1", - "resolved": "https://registry.npmjs.org/es-toolkit/-/es-toolkit-1.46.1.tgz", - "integrity": "sha512-5eNtXOs3tbfxXOj04tjjseeWkRWaoCjdEI+96DgwzZoe6c9juL49pXlzAFTI72aWC9Y8p7168g6XIKjh7k6pyQ==", + "node_modules/flatted": { + "version": "3.4.2", + "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.4.2.tgz", + "integrity": "sha512-PjDse7RzhcPkIJwy5t7KPWQSZ9cAbzQXcafsetQoD7sOJRQlGikNbx7yZp2OotDnJyrDcbyRq3Ttb18iYOqkxA==", "dev": true, - "license": "MIT", - "workspaces": [ - "docs", - "benchmarks" - ] + "license": "ISC" }, - "node_modules/esbuild": { - "version": "0.27.7", - "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.27.7.tgz", - "integrity": "sha512-IxpibTjyVnmrIQo5aqNpCgoACA/dTKLTlhMHihVHhdkxKyPO1uBBthumT0rdHmcsk9uMonIWS0m4FljWzILh3w==", + "node_modules/follow-redirects": { + "version": "1.16.0", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.16.0.tgz", + "integrity": "sha512-y5rN/uOsadFT/JfYwhxRS5R7Qce+g3zG97+JrtFZlC9klX/W5hD7iiLzScI4nZqUS7DNUdhPgw4xI8W2LuXlUw==", "dev": true, - "hasInstallScript": true, + "funding": [ + { + "type": "individual", + "url": "https://github.com/sponsors/RubenVerborgh" + } + ], "license": "MIT", - "bin": { - "esbuild": "bin/esbuild" - }, "engines": { - "node": ">=18" + "node": ">=4.0" }, - "optionalDependencies": { - "@esbuild/aix-ppc64": "0.27.7", - "@esbuild/android-arm": "0.27.7", - "@esbuild/android-arm64": "0.27.7", - "@esbuild/android-x64": "0.27.7", - "@esbuild/darwin-arm64": "0.27.7", - "@esbuild/darwin-x64": "0.27.7", - "@esbuild/freebsd-arm64": "0.27.7", - "@esbuild/freebsd-x64": "0.27.7", - "@esbuild/linux-arm": "0.27.7", - "@esbuild/linux-arm64": "0.27.7", - "@esbuild/linux-ia32": "0.27.7", - "@esbuild/linux-loong64": "0.27.7", - "@esbuild/linux-mips64el": "0.27.7", - "@esbuild/linux-ppc64": "0.27.7", - "@esbuild/linux-riscv64": "0.27.7", - "@esbuild/linux-s390x": "0.27.7", - "@esbuild/linux-x64": "0.27.7", - "@esbuild/netbsd-arm64": "0.27.7", - "@esbuild/netbsd-x64": "0.27.7", - "@esbuild/openbsd-arm64": "0.27.7", - "@esbuild/openbsd-x64": "0.27.7", - "@esbuild/openharmony-arm64": "0.27.7", - "@esbuild/sunos-x64": "0.27.7", - "@esbuild/win32-arm64": "0.27.7", - "@esbuild/win32-ia32": "0.27.7", - "@esbuild/win32-x64": "0.27.7" + "peerDependenciesMeta": { + "debug": { + "optional": true + } } }, - "node_modules/escape-html": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/escape-html/-/escape-html-1.0.3.tgz", - "integrity": "sha512-NiSupZ4OeuGwr68lGIeym/ksIZMJodUGOSCZ/FSnTxcrekbvqrgdUxlJOMpijaKZVjAJrWrGs/6Jy8OMuyj9ow==", - "dev": true, - "license": "MIT" - }, - "node_modules/escape-string-regexp": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", - "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", + "node_modules/for-each": { + "version": "0.3.5", + "resolved": "https://registry.npmjs.org/for-each/-/for-each-0.3.5.tgz", + "integrity": "sha512-dKx12eRCVIzqCxFGplyFKJMPvLEWgmNtUrpTiJIR5u97zEhRG8ySrtboPHZXx7daLxQVrl643cTzbab2tkQjxg==", "dev": true, "license": "MIT", + "dependencies": { + "is-callable": "^1.2.7" + }, "engines": { - "node": ">=10" + "node": ">= 0.4" }, "funding": { - "url": "https://github.com/sponsors/sindresorhus" + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/eslint": { - "version": "9.39.2", - "resolved": "https://registry.npmjs.org/eslint/-/eslint-9.39.2.tgz", - "integrity": "sha512-LEyamqS7W5HB3ujJyvi0HQK/dtVINZvd5mAAp9eT5S/ujByGjiZLCzPcHVzuXbpJDJF/cxwHlfceVUDZ2lnSTw==", + "node_modules/foreground-child": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.1.tgz", + "integrity": "sha512-gIXjKqtFuWEgzFRJA9WCQeSJLZDjgJUOMCMzxtvFq/37KojM1BFGufqsCy0r4qSQmYLsZYMeyRqzIWOMup03sw==", "dev": true, - "license": "MIT", + "license": "ISC", "dependencies": { - "@eslint-community/eslint-utils": "^4.8.0", - "@eslint-community/regexpp": "^4.12.1", - "@eslint/config-array": "^0.21.1", - "@eslint/config-helpers": "^0.4.2", - "@eslint/core": "^0.17.0", - "@eslint/eslintrc": "^3.3.1", - "@eslint/js": "9.39.2", - "@eslint/plugin-kit": "^0.4.1", - "@humanfs/node": "^0.16.6", - "@humanwhocodes/module-importer": "^1.0.1", - "@humanwhocodes/retry": "^0.4.2", - "@types/estree": "^1.0.6", - "ajv": "^6.12.4", - "chalk": "^4.0.0", "cross-spawn": "^7.0.6", - "debug": "^4.3.2", - "escape-string-regexp": "^4.0.0", - "eslint-scope": "^8.4.0", - "eslint-visitor-keys": "^4.2.1", - "espree": "^10.4.0", - "esquery": "^1.5.0", - "esutils": "^2.0.2", - "fast-deep-equal": "^3.1.3", - "file-entry-cache": "^8.0.0", - "find-up": "^5.0.0", - "glob-parent": "^6.0.2", - "ignore": "^5.2.0", - "imurmurhash": "^0.1.4", - "is-glob": "^4.0.0", - "json-stable-stringify-without-jsonify": "^1.0.1", - "lodash.merge": "^4.6.2", - "minimatch": "^3.1.2", - "natural-compare": "^1.4.0", - "optionator": "^0.9.3" - }, - "bin": { - "eslint": "bin/eslint.js" + "signal-exit": "^4.0.1" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": ">=14" }, "funding": { - "url": "https://eslint.org/donate" - }, - "peerDependencies": { - "jiti": "*" - }, - "peerDependenciesMeta": { - "jiti": { - "optional": true - } + "url": "https://github.com/sponsors/isaacs" } }, - "node_modules/eslint-config-prettier": { - "version": "10.1.8", - "resolved": "https://registry.npmjs.org/eslint-config-prettier/-/eslint-config-prettier-10.1.8.tgz", - "integrity": "sha512-82GZUjRS0p/jganf6q1rEO25VSoHH0hKPCTrgillPjdI/3bgBhAE1QzHrHTizjpRvy6pGAvKjDJtk2pF9NDq8w==", + "node_modules/forwarded": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", + "integrity": "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==", "dev": true, "license": "MIT", - "bin": { - "eslint-config-prettier": "bin/cli.js" - }, - "funding": { - "url": "https://opencollective.com/eslint-config-prettier" - }, - "peerDependencies": { - "eslint": ">=7.0.0" + "engines": { + "node": ">= 0.6" } }, - "node_modules/eslint-plugin-storybook": { - "version": "10.2.4", - "resolved": "https://registry.npmjs.org/eslint-plugin-storybook/-/eslint-plugin-storybook-10.2.4.tgz", - "integrity": "sha512-D8a6Y+iun2MSOpgps0Vd/t8y9Y5ZZ7O2VeKqw2PCv2+b7yInqogOS2VBMSRZVfP8TTGQgDpbUK67k7KZEUC7Ng==", + "node_modules/fresh": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/fresh/-/fresh-2.0.0.tgz", + "integrity": "sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==", "dev": true, "license": "MIT", - "dependencies": { - "@typescript-eslint/utils": "^8.48.0" - }, - "peerDependencies": { - "eslint": ">=8", - "storybook": "^10.2.4" + "engines": { + "node": ">= 0.8" } }, - "node_modules/eslint-plugin-svelte": { - "version": "3.15.0", - "resolved": "https://registry.npmjs.org/eslint-plugin-svelte/-/eslint-plugin-svelte-3.15.0.tgz", - "integrity": "sha512-QKB7zqfuB8aChOfBTComgDptMf2yxiJx7FE04nneCmtQzgTHvY8UJkuh8J2Rz7KB9FFV9aTHX6r7rdYGvG8T9Q==", + "node_modules/fs-extra": { + "version": "9.1.0", + "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-9.1.0.tgz", + "integrity": "sha512-hcg3ZmepS30/7BSFqRvoo3DOMQu7IjqxO5nCDt+zM9XWjb33Wg7ziNT+Qvqbuc3+gWpzO02JubVyk2G4Zvo1OQ==", "dev": true, "license": "MIT", "dependencies": { - "@eslint-community/eslint-utils": "^4.6.1", - "@jridgewell/sourcemap-codec": "^1.5.0", - "esutils": "^2.0.3", - "globals": "^16.0.0", - "known-css-properties": "^0.37.0", - "postcss": "^8.4.49", - "postcss-load-config": "^3.1.4", - "postcss-safe-parser": "^7.0.0", - "semver": "^7.6.3", - "svelte-eslint-parser": "^1.4.0" + "at-least-node": "^1.0.0", + "graceful-fs": "^4.2.0", + "jsonfile": "^6.0.1", + "universalify": "^2.0.0" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, - "funding": { - "url": "https://github.com/sponsors/ota-meshi" - }, - "peerDependencies": { - "eslint": "^8.57.1 || ^9.0.0 || ^10.0.0", - "svelte": "^3.37.0 || ^4.0.0 || ^5.0.0" - }, - "peerDependenciesMeta": { - "svelte": { - "optional": true - } + "node": ">=10" + } + }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "dev": true, + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/function-bind": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/eslint-scope": { - "version": "8.4.0", - "resolved": "https://registry.npmjs.org/eslint-scope/-/eslint-scope-8.4.0.tgz", - "integrity": "sha512-sNXOfKCn74rt8RICKMvJS7XKV/Xk9kA7DyJr8mJik3S7Cwgy3qlkkmyS2uQB3jiJg6VNdZd/pDBJu0nvG2NlTg==", + "node_modules/function.prototype.name": { + "version": "1.1.8", + "resolved": "https://registry.npmjs.org/function.prototype.name/-/function.prototype.name-1.1.8.tgz", + "integrity": "sha512-e5iwyodOHhbMr/yNrc7fDYG4qlbIvI5gajyzPnb5TCwyhjApznQh1BMFou9b30SevY43gCJKXycoCBjMbsuW0Q==", "dev": true, - "license": "BSD-2-Clause", + "license": "MIT", "dependencies": { - "esrecurse": "^4.3.0", - "estraverse": "^5.2.0" + "call-bind": "^1.0.8", + "call-bound": "^1.0.3", + "define-properties": "^1.2.1", + "functions-have-names": "^1.2.3", + "hasown": "^2.0.2", + "is-callable": "^1.2.7" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": ">= 0.4" }, "funding": { - "url": "https://opencollective.com/eslint" + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/eslint-visitor-keys": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-4.2.1.tgz", - "integrity": "sha512-Uhdk5sfqcee/9H/rCOJikYz67o0a2Tw2hGRPOG2Y1R2dg7brRe1uG0yaNQDHu+TO/uQPF/5eCapvYSmHUjt7JQ==", + "node_modules/functions-have-names": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/functions-have-names/-/functions-have-names-1.2.3.tgz", + "integrity": "sha512-xckBUXyTIqT97tq2x2AMb+g163b5JFysYk0x4qxNFwbfQkmNZoiRHb6sPzI9/QV33WeuvVYBUIiD4NzNIyqaRQ==", "dev": true, - "license": "Apache-2.0", - "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" - }, + "license": "MIT", "funding": { - "url": "https://opencollective.com/eslint" + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/esm-env": { - "version": "1.2.2", - "resolved": "https://registry.npmjs.org/esm-env/-/esm-env-1.2.2.tgz", - "integrity": "sha512-Epxrv+Nr/CaL4ZcFGPJIYLWFom+YeV1DqMLHJoEd9SYRxNbaFruBwfEX/kkHUJf55j2+TUbmDcmuilbP1TmXHA==", + "node_modules/generator-function": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/generator-function/-/generator-function-2.0.1.tgz", + "integrity": "sha512-SFdFmIJi+ybC0vjlHN0ZGVGHc3lgE0DxPAT0djjVg+kjOnSqclqmj0KQ7ykTOLP6YxoqOvuAODGdcHJn+43q3g==", "dev": true, - "license": "MIT" + "license": "MIT", + "engines": { + "node": ">= 0.4" + } }, - "node_modules/espree": { - "version": "10.4.0", - "resolved": "https://registry.npmjs.org/espree/-/espree-10.4.0.tgz", - "integrity": "sha512-j6PAQ2uUr79PZhBjP5C5fhl8e39FmRnOjsD5lGnWrFU8i2G776tBK7+nP8KuQUTTyAZUwfQqXAgrVH5MbH9CYQ==", + "node_modules/gensync": { + "version": "1.0.0-beta.2", + "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz", + "integrity": "sha512-3hN7NaskYvMDLQY55gnW3NQ+mesEAepTqlg+VEbj7zzqEMBVNhzcGYYeqFo/TlYz6eQiFcp1HcsCZO+nGgS8zg==", "dev": true, - "license": "BSD-2-Clause", + "license": "MIT", + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/get-intrinsic": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", + "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", + "dev": true, + "license": "MIT", "dependencies": { - "acorn": "^8.15.0", - "acorn-jsx": "^5.3.2", - "eslint-visitor-keys": "^4.2.1" + "call-bind-apply-helpers": "^1.0.2", + "es-define-property": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", + "function-bind": "^1.1.2", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" }, "engines": { - "node": "^18.18.0 || ^20.9.0 || >=21.1.0" + "node": ">= 0.4" }, "funding": { - "url": "https://opencollective.com/eslint" + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/esprima": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz", - "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==", + "node_modules/get-own-enumerable-property-symbols": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/get-own-enumerable-property-symbols/-/get-own-enumerable-property-symbols-3.0.2.tgz", + "integrity": "sha512-I0UBV/XOz1XkIJHEUDMZAbzCThU/H8DxmSfmdGcKPnVhu2VfFqr34jr9777IyaTYvxjedWhqVIilEDsCdP5G6g==", "dev": true, - "license": "BSD-2-Clause", - "bin": { - "esparse": "bin/esparse.js", - "esvalidate": "bin/esvalidate.js" - }, - "engines": { - "node": ">=4" - } + "license": "ISC" }, - "node_modules/esquery": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/esquery/-/esquery-1.6.0.tgz", - "integrity": "sha512-ca9pw9fomFcKPvFLXhBKUK90ZvGibiGOvRJNbjljY7s7uq/5YO4BOzcYtJqExdx99rF6aAcnRxHmcUHcz6sQsg==", + "node_modules/get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", "dev": true, - "license": "BSD-3-Clause", + "license": "MIT", "dependencies": { - "estraverse": "^5.1.0" + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" }, "engines": { - "node": ">=0.10" + "node": ">= 0.4" } }, - "node_modules/esrap": { - "version": "1.4.9", - "resolved": "https://registry.npmjs.org/esrap/-/esrap-1.4.9.tgz", - "integrity": "sha512-3OMlcd0a03UGuZpPeUC1HxR3nA23l+HEyCiZw3b3FumJIN9KphoGzDJKMXI1S72jVS1dsenDyQC0kJlO1U9E1g==", + "node_modules/get-symbol-description": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/get-symbol-description/-/get-symbol-description-1.1.0.tgz", + "integrity": "sha512-w9UMqWwJxHNOvoNzSJ2oPF5wvYcvP7jUvYzhp67yEhTi17ZDBBC1z9pTdGuzjD+EFIqLSYRweZjqfiPzQ06Ebg==", "dev": true, "license": "MIT", "dependencies": { - "@jridgewell/sourcemap-codec": "^1.4.15" + "call-bound": "^1.0.3", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/esrecurse": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/esrecurse/-/esrecurse-4.3.0.tgz", - "integrity": "sha512-KmfKL3b6G+RXvP8N1vr3Tq1kL/oCFgn2NYXEtqP8/L3pKapUA4G8cFVaoF3SU323CD4XypR/ffioHmkti6/Tag==", + "node_modules/glob-parent": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz", + "integrity": "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==", "dev": true, - "license": "BSD-2-Clause", + "license": "ISC", "dependencies": { - "estraverse": "^5.2.0" + "is-glob": "^4.0.3" }, "engines": { - "node": ">=4.0" + "node": ">=10.13.0" } }, - "node_modules/estraverse": { - "version": "5.3.0", - "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", - "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", + "node_modules/globals": { + "version": "16.5.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-16.5.0.tgz", + "integrity": "sha512-c/c15i26VrJ4IRt5Z89DnIzCGDn9EcebibhAOjw5ibqEHsE1wLUgkPn9RDmNcUKyU87GeaL633nyJ+pplFR2ZQ==", "dev": true, - "license": "BSD-2-Clause", + "license": "MIT", "engines": { - "node": ">=4.0" + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/estree-walker": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-3.0.3.tgz", - "integrity": "sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g==", + "node_modules/globalthis": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/globalthis/-/globalthis-1.0.4.tgz", + "integrity": "sha512-DpLKbNU4WylpxJykQujfCcwYWiV/Jhm50Goo0wrVILAv5jOr9d+H+UR3PhSCD2rCCEIg0uc+G+muBTwD54JhDQ==", "dev": true, "license": "MIT", "dependencies": { - "@types/estree": "^1.0.0" - } - }, - "node_modules/esutils": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz", - "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==", - "dev": true, - "license": "BSD-2-Clause", + "define-properties": "^1.2.1", + "gopd": "^1.0.1" + }, "engines": { - "node": ">=0.10.0" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/etag": { - "version": "1.8.1", - "resolved": "https://registry.npmjs.org/etag/-/etag-1.8.1.tgz", - "integrity": "sha512-aIL5Fx7mawVa300al2BnEE4iNvo1qETxLrPI/o05L7z6go7fCw1J6EQmbK4FmJ2AS7kgVF/KEZWufBfdClMcPg==", + "node_modules/gopd": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", "dev": true, "license": "MIT", "engines": { - "node": ">= 0.6" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/eventemitter3": { - "version": "4.0.7", - "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-4.0.7.tgz", - "integrity": "sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==", + "node_modules/graceful-fs": { + "version": "4.2.11", + "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", + "integrity": "sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==", + "dev": true, + "license": "ISC" + }, + "node_modules/hachure-fill": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/hachure-fill/-/hachure-fill-0.5.2.tgz", + "integrity": "sha512-3GKBOn+m2LX9iq+JC1064cSFprJY4jL1jCXTcpnfER5HYE2l/4EfWSGzkPa/ZDBmYI0ZOEj5VHV/eKnPGkHuOg==", "dev": true, "license": "MIT" }, - "node_modules/eventsource": { - "version": "3.0.7", - "resolved": "https://registry.npmjs.org/eventsource/-/eventsource-3.0.7.tgz", - "integrity": "sha512-CRT1WTyuQoD771GW56XEZFQ/ZoSfWid1alKGDYMmkt2yl8UXrVR4pspqWNEcqKvVIzg6PAltWjxcSSPrboA4iA==", + "node_modules/has-bigints": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-bigints/-/has-bigints-1.1.0.tgz", + "integrity": "sha512-R3pbpkcIqv2Pm3dUwgjclDRVmWpTJW2DcMzcIhEXEx1oh/CEMObMm3KLmRJOdvhM7o4uQBnwr8pzRK2sJWIqfg==", "dev": true, "license": "MIT", - "dependencies": { - "eventsource-parser": "^3.0.1" - }, "engines": { - "node": ">=18.0.0" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/eventsource-parser": { - "version": "3.0.6", - "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.6.tgz", - "integrity": "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==", + "node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", "dev": true, "license": "MIT", "engines": { - "node": ">=18.0.0" + "node": ">=8" } }, - "node_modules/expect-type": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/expect-type/-/expect-type-1.3.0.tgz", - "integrity": "sha512-knvyeauYhqjOYvQ66MznSMs83wmHrCycNEN6Ao+2AeYEfxUIkuiVxdEa1qlGEPK+We3n0THiDciYSsCcgW/DoA==", + "node_modules/has-property-descriptors": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.2.tgz", + "integrity": "sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg==", "dev": true, - "license": "Apache-2.0", - "engines": { - "node": ">=12.0.0" + "license": "MIT", + "dependencies": { + "es-define-property": "^1.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/express": { - "version": "5.2.1", - "resolved": "https://registry.npmjs.org/express/-/express-5.2.1.tgz", - "integrity": "sha512-hIS4idWWai69NezIdRt2xFVofaF4j+6INOpJlVOLDO8zXGpUVEVzIYk12UUi2JzjEzWL3IOAxcTubgz9Po0yXw==", + "node_modules/has-proto": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/has-proto/-/has-proto-1.2.0.tgz", + "integrity": "sha512-KIL7eQPfHQRC8+XluaIw7BHUwwqL19bQn4hzNgdr+1wXoU0KKj6rufu47lhY7KbJR2C6T6+PfyN0Ea7wkSS+qQ==", "dev": true, "license": "MIT", "dependencies": { - "accepts": "^2.0.0", - "body-parser": "^2.2.1", - "content-disposition": "^1.0.0", - "content-type": "^1.0.5", - "cookie": "^0.7.1", - "cookie-signature": "^1.2.1", - "debug": "^4.4.0", - "depd": "^2.0.0", - "encodeurl": "^2.0.0", - "escape-html": "^1.0.3", - "etag": "^1.8.1", - "finalhandler": "^2.1.0", - "fresh": "^2.0.0", - "http-errors": "^2.0.0", - "merge-descriptors": "^2.0.0", - "mime-types": "^3.0.0", - "on-finished": "^2.4.1", - "once": "^1.4.0", - "parseurl": "^1.3.3", - "proxy-addr": "^2.0.7", - "qs": "^6.14.0", - "range-parser": "^1.2.1", - "router": "^2.2.0", - "send": "^1.1.0", - "serve-static": "^2.2.0", - "statuses": "^2.0.1", - "type-is": "^2.0.1", - "vary": "^1.1.2" + "dunder-proto": "^1.0.0" }, "engines": { - "node": ">= 18" + "node": ">= 0.4" }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/express" + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/express-rate-limit": { - "version": "8.5.2", - "resolved": "https://registry.npmjs.org/express-rate-limit/-/express-rate-limit-8.5.2.tgz", - "integrity": "sha512-5Kb34ipNX694DH48vN9irak1Qx30nb0PLYHXfJgw4YEjiC3ZEmZJhwOp+VfiCYwFzvFTdB9QkArYS5kXa2cx2A==", + "node_modules/has-symbols": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-tostringtag": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", "dev": true, "license": "MIT", "dependencies": { - "ip-address": "^10.2.0" + "has-symbols": "^1.0.3" }, "engines": { - "node": ">= 16" + "node": ">= 0.4" }, "funding": { - "url": "https://github.com/sponsors/express-rate-limit" - }, - "peerDependencies": { - "express": ">= 4.11" + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/extend": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", - "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==", + "node_modules/hasown": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.4.tgz", + "integrity": "sha512-T2UbfbBEF32wiepXIsMlTW9+dDYC6wMh/t/vYA4tuOMKqWz/n3vr1NFSxQiyP+zk2mXsoMA/i/7qV6LKut1t1A==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } }, - "node_modules/fast-deep-equal": { - "version": "3.1.3", - "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz", - "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==", + "node_modules/hast-util-from-dom": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/hast-util-from-dom/-/hast-util-from-dom-5.0.1.tgz", + "integrity": "sha512-N+LqofjR2zuzTjCPzyDUdSshy4Ma6li7p/c3pA78uTwzFgENbgbUrm2ugwsOdcjI1muO+o6Dgzp9p8WHtn/39Q==", "dev": true, - "license": "MIT" + "license": "ISC", + "dependencies": { + "@types/hast": "^3.0.0", + "hastscript": "^9.0.0", + "web-namespaces": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } }, - "node_modules/fast-json-stable-stringify": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/fast-json-stable-stringify/-/fast-json-stable-stringify-2.1.0.tgz", - "integrity": "sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==", + "node_modules/hast-util-from-html": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/hast-util-from-html/-/hast-util-from-html-2.0.3.tgz", + "integrity": "sha512-CUSRHXyKjzHov8yKsQjGOElXy/3EKpyX56ELnkHH34vDVw1N1XSQ1ZcAvTyAPtGqLTuKP/uxM+aLkSPqF/EtMw==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "devlop": "^1.1.0", + "hast-util-from-parse5": "^8.0.0", + "parse5": "^7.0.0", + "vfile": "^6.0.0", + "vfile-message": "^4.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } }, - "node_modules/fast-levenshtein": { - "version": "2.0.6", - "resolved": "https://registry.npmjs.org/fast-levenshtein/-/fast-levenshtein-2.0.6.tgz", - "integrity": "sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==", + "node_modules/hast-util-from-html-isomorphic": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/hast-util-from-html-isomorphic/-/hast-util-from-html-isomorphic-2.0.0.tgz", + "integrity": "sha512-zJfpXq44yff2hmE0XmwEOzdWin5xwH+QIhMLOScpX91e/NSGPsAzNCvLQDIEPyO2TXi+lBmU6hjLIhV8MwP2kw==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "hast-util-from-dom": "^5.0.0", + "hast-util-from-html": "^2.0.0", + "unist-util-remove-position": "^5.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } }, - "node_modules/fast-uri": { - "version": "3.1.2", - "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.1.2.tgz", - "integrity": "sha512-rVjf7ArG3LTk+FS6Yw81V1DLuZl1bRbNrev6Tmd/9RaroeeRRJhAt7jg/6YFxbvAQXUCavSoZhPPj6oOx+5KjQ==", + "node_modules/hast-util-from-html/node_modules/@types/unist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", + "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", "dev": true, - "funding": [ - { - "type": "github", - "url": "https://github.com/sponsors/fastify" - }, - { - "type": "opencollective", - "url": "https://opencollective.com/fastify" - } - ], - "license": "BSD-3-Clause" + "license": "MIT" }, - "node_modules/fdir": { - "version": "6.5.0", - "resolved": "https://registry.npmjs.org/fdir/-/fdir-6.5.0.tgz", - "integrity": "sha512-tIbYtZbucOs0BRGqPJkshJUYdL+SDH7dVM8gjy+ERp3WAUjLEFJE+02kanyHtwjWOnwrKYBiwAmM0p4kLJAnXg==", + "node_modules/hast-util-from-html/node_modules/vfile-message": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.3.tgz", + "integrity": "sha512-QTHzsGd1EhbZs4AsQ20JX1rC3cOlt/IWJruk893DfLRr57lcnOeMaWG4K0JrRta4mIJZKth2Au3mM3u03/JWKw==", "dev": true, "license": "MIT", - "engines": { - "node": ">=12.0.0" - }, - "peerDependencies": { - "picomatch": "^3 || ^4" + "dependencies": { + "@types/unist": "^3.0.0", + "unist-util-stringify-position": "^4.0.0" }, - "peerDependenciesMeta": { - "picomatch": { - "optional": true - } + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" } }, - "node_modules/file-entry-cache": { - "version": "8.0.0", - "resolved": "https://registry.npmjs.org/file-entry-cache/-/file-entry-cache-8.0.0.tgz", - "integrity": "sha512-XXTUwCvisa5oacNGRP9SfNtYBNAMi+RPwBFmblZEF7N7swHYQS6/Zfk7SRwx4D5j3CH211YNRco1DEMNVfZCnQ==", + "node_modules/hast-util-from-parse5": { + "version": "8.0.3", + "resolved": "https://registry.npmjs.org/hast-util-from-parse5/-/hast-util-from-parse5-8.0.3.tgz", + "integrity": "sha512-3kxEVkEKt0zvcZ3hCRYI8rqrgwtlIOFMWkbclACvjlDw8Li9S2hk/d51OI0nr/gIpdMHNepwgOKqZ/sy0Clpyg==", "dev": true, "license": "MIT", "dependencies": { - "flat-cache": "^4.0.0" + "@types/hast": "^3.0.0", + "@types/unist": "^3.0.0", + "devlop": "^1.0.0", + "hastscript": "^9.0.0", + "property-information": "^7.0.0", + "vfile": "^6.0.0", + "vfile-location": "^5.0.0", + "web-namespaces": "^2.0.0" }, - "engines": { - "node": ">=16.0.0" + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" } }, - "node_modules/filesize": { - "version": "10.1.6", - "resolved": "https://registry.npmjs.org/filesize/-/filesize-10.1.6.tgz", - "integrity": "sha512-sJslQKU2uM33qH5nqewAwVB2QgR6w1aMNsYUp3aN5rMRyXEwJGmZvaWzeJFNTOXWlHQyBFCWrdj3fV/fsTOX8w==", + "node_modules/hast-util-from-parse5/node_modules/@types/unist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", + "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", "dev": true, - "license": "BSD-3-Clause", - "engines": { - "node": ">= 10.4.0" - } + "license": "MIT" }, - "node_modules/fill-range": { - "version": "7.1.1", - "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", - "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==", + "node_modules/hast-util-is-element": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/hast-util-is-element/-/hast-util-is-element-3.0.0.tgz", + "integrity": "sha512-Val9mnv2IWpLbNPqc/pUem+a7Ipj2aHacCwgNfTiK0vJKl0LF+4Ba4+v1oPHFpf3bLYmreq0/l3Gud9S5OH42g==", "dev": true, "license": "MIT", - "optional": true, "dependencies": { - "to-regex-range": "^5.0.1" + "@types/hast": "^3.0.0" }, - "engines": { - "node": ">=8" + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" } }, - "node_modules/finalhandler": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/finalhandler/-/finalhandler-2.1.1.tgz", - "integrity": "sha512-S8KoZgRZN+a5rNwqTxlZZePjT/4cnm0ROV70LedRHZ0p8u9fRID0hJUZQpkKLzro8LfmC8sx23bY6tVNxv8pQA==", + "node_modules/hast-util-parse-selector": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-4.0.0.tgz", + "integrity": "sha512-wkQCkSYoOGCRKERFWcxMVMOcYE2K1AaNLU8DXS9arxnLOUEWbOXKXiJUNzEpqZ3JOKpnha3jkFrumEjVliDe7A==", "dev": true, "license": "MIT", "dependencies": { - "debug": "^4.4.0", - "encodeurl": "^2.0.0", - "escape-html": "^1.0.3", - "on-finished": "^2.4.1", - "parseurl": "^1.3.3", - "statuses": "^2.0.1" - }, - "engines": { - "node": ">= 18.0.0" + "@types/hast": "^3.0.0" }, "funding": { "type": "opencollective", - "url": "https://opencollective.com/express" + "url": "https://opencollective.com/unified" } }, - "node_modules/find-up": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", - "integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==", + "node_modules/hast-util-sanitize": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/hast-util-sanitize/-/hast-util-sanitize-5.0.2.tgz", + "integrity": "sha512-3yTWghByc50aGS7JlGhk61SPenfE/p1oaFeNwkOOyrscaOkMGrcW9+Cy/QAIOBpZxP1yqDIzFMR0+Np0i0+usg==", "dev": true, "license": "MIT", "dependencies": { - "locate-path": "^6.0.0", - "path-exists": "^4.0.0" + "@types/hast": "^3.0.0", + "@ungap/structured-clone": "^1.0.0", + "unist-util-position": "^5.0.0" }, - "engines": { - "node": ">=10" + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/hast-util-to-html": { + "version": "9.0.5", + "resolved": "https://registry.npmjs.org/hast-util-to-html/-/hast-util-to-html-9.0.5.tgz", + "integrity": "sha512-OguPdidb+fbHQSU4Q4ZiLKnzWo8Wwsf5bZfbvu7//a9oTYoqD/fWpe96NuHkoS9h0ccGOTe0C4NGXdtS0iObOw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/hast": "^3.0.0", + "@types/unist": "^3.0.0", + "ccount": "^2.0.0", + "comma-separated-tokens": "^2.0.0", + "hast-util-whitespace": "^3.0.0", + "html-void-elements": "^3.0.0", + "mdast-util-to-hast": "^13.0.0", + "property-information": "^7.0.0", + "space-separated-tokens": "^2.0.0", + "stringify-entities": "^4.0.0", + "zwitch": "^2.0.4" }, "funding": { - "url": "https://github.com/sponsors/sindresorhus" + "type": "opencollective", + "url": "https://opencollective.com/unified" } }, - "node_modules/flat-cache": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-4.0.1.tgz", - "integrity": "sha512-f7ccFPK3SXFHpx15UIGyRJ/FJQctuKZ0zVuN3frBo4HnK3cay9VEW0R6yPYFHC0AgqhukPzKjq22t5DmAyqGyw==", + "node_modules/hast-util-to-html/node_modules/@types/unist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", + "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", + "dev": true, + "license": "MIT" + }, + "node_modules/hast-util-to-text": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/hast-util-to-text/-/hast-util-to-text-4.0.2.tgz", + "integrity": "sha512-KK6y/BN8lbaq654j7JgBydev7wuNMcID54lkRav1P0CaE1e47P72AWWPiGKXTJU271ooYzcvTAn/Zt0REnvc7A==", "dev": true, "license": "MIT", "dependencies": { - "flatted": "^3.2.9", - "keyv": "^4.5.4" + "@types/hast": "^3.0.0", + "@types/unist": "^3.0.0", + "hast-util-is-element": "^3.0.0", + "unist-util-find-after": "^5.0.0" }, - "engines": { - "node": ">=16" + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" } }, - "node_modules/flatted": { - "version": "3.4.2", - "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.4.2.tgz", - "integrity": "sha512-PjDse7RzhcPkIJwy5t7KPWQSZ9cAbzQXcafsetQoD7sOJRQlGikNbx7yZp2OotDnJyrDcbyRq3Ttb18iYOqkxA==", + "node_modules/hast-util-to-text/node_modules/@types/unist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", + "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", "dev": true, - "license": "ISC" + "license": "MIT" }, - "node_modules/follow-redirects": { - "version": "1.16.0", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.16.0.tgz", - "integrity": "sha512-y5rN/uOsadFT/JfYwhxRS5R7Qce+g3zG97+JrtFZlC9klX/W5hD7iiLzScI4nZqUS7DNUdhPgw4xI8W2LuXlUw==", + "node_modules/hast-util-whitespace": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/hast-util-whitespace/-/hast-util-whitespace-3.0.0.tgz", + "integrity": "sha512-88JUN06ipLwsnv+dVn+OIYOvAuvBMy/Qoi6O7mQHxdPXpjy+Cd6xRkWwux7DKO+4sYILtLBRIKgsdpS2gQc7qw==", "dev": true, - "funding": [ - { - "type": "individual", - "url": "https://github.com/sponsors/RubenVerborgh" - } - ], "license": "MIT", - "engines": { - "node": ">=4.0" + "dependencies": { + "@types/hast": "^3.0.0" }, - "peerDependenciesMeta": { - "debug": { - "optional": true - } + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" } }, - "node_modules/forwarded": { - "version": "0.2.0", - "resolved": "https://registry.npmjs.org/forwarded/-/forwarded-0.2.0.tgz", - "integrity": "sha512-buRG0fpBtRHSTCOASe6hD258tEubFoRLb4ZNA6NxMVHNw2gOcwHo9wyablzMzOA5z9xA9L1KNjk/Nt6MT9aYow==", + "node_modules/hastscript": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/hastscript/-/hastscript-9.0.1.tgz", + "integrity": "sha512-g7df9rMFX/SPi34tyGCyUBREQoKkapwdY/T04Qn9TDWfHhAYt4/I0gMVirzK5wEzeUqIjEB+LXC/ypb7Aqno5w==", "dev": true, "license": "MIT", - "engines": { - "node": ">= 0.6" + "dependencies": { + "@types/hast": "^3.0.0", + "comma-separated-tokens": "^2.0.0", + "hast-util-parse-selector": "^4.0.0", + "property-information": "^7.0.0", + "space-separated-tokens": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" } }, - "node_modules/fresh": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/fresh/-/fresh-2.0.0.tgz", - "integrity": "sha512-Rx/WycZ60HOaqLKAi6cHRKKI7zxWbJ31MhntmtwMoaTeF7XFH9hhBp8vITaMidfljRQ6eYWCKkaTK+ykVJHP2A==", + "node_modules/he": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz", + "integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==", "dev": true, "license": "MIT", - "engines": { - "node": ">= 0.8" + "bin": { + "he": "bin/he" } }, - "node_modules/fsevents": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", - "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "node_modules/highlight.js": { + "version": "11.11.1", + "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-11.11.1.tgz", + "integrity": "sha512-Xwwo44whKBVCYoliBQwaPvtd/2tYFkRQtXDWj1nackaV2JPXx3L0+Jvd8/qCJ2p+ML0/XVkJ2q+Mr+UVdpJK5w==", "dev": true, - "hasInstallScript": true, - "license": "MIT", - "optional": true, - "os": [ - "darwin" - ], + "license": "BSD-3-Clause", "engines": { - "node": "^8.16.0 || ^10.6.0 || >=11.0.0" - } - }, - "node_modules/function-bind": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", - "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", - "dev": true, - "license": "MIT", - "funding": { - "url": "https://github.com/sponsors/ljharb" + "node": ">=12.0.0" } }, - "node_modules/get-intrinsic": { - "version": "1.3.0", - "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", - "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", + "node_modules/hono": { + "version": "4.12.26", + "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.26.tgz", + "integrity": "sha512-uyZtpnYxM9CmQ7QsQknM4zN8EftNqhON1qYeIKM0Se67CCEe2c44xyGURwB0axX2fBDu1dqHrHAc1hmNT8ITkw==", "dev": true, "license": "MIT", - "dependencies": { - "call-bind-apply-helpers": "^1.0.2", - "es-define-property": "^1.0.1", - "es-errors": "^1.3.0", - "es-object-atoms": "^1.1.1", - "function-bind": "^1.1.2", - "get-proto": "^1.0.1", - "gopd": "^1.2.0", - "has-symbols": "^1.1.0", - "hasown": "^2.0.2", - "math-intrinsics": "^1.1.0" - }, "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" + "node": ">=16.9.0" } }, - "node_modules/get-proto": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", - "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "node_modules/html-encoding-sniffer": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-3.0.0.tgz", + "integrity": "sha512-oWv4T4yJ52iKrufjnyZPkrN0CH3QnrUqdB6In1g5Fe1mia8GmF36gnfNySxoZtxD5+NmYw1EElVXiBk93UeskA==", "dev": true, "license": "MIT", "dependencies": { - "dunder-proto": "^1.0.1", - "es-object-atoms": "^1.0.0" + "whatwg-encoding": "^2.0.0" }, "engines": { - "node": ">= 0.4" + "node": ">=12" } }, - "node_modules/glob-parent": { - "version": "6.0.2", - "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-6.0.2.tgz", - "integrity": "sha512-XxwI8EOhVQgWp6iDL+3b0r86f4d6AX6zSU55HfB4ydCEuXLXc5FcYeOu+nnGftS4TEju/11rt4KJPTMgbfmv4A==", + "node_modules/html-escaper": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", + "integrity": "sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==", "dev": true, - "license": "ISC", - "dependencies": { - "is-glob": "^4.0.3" - }, - "engines": { - "node": ">=10.13.0" - } + "license": "MIT" }, - "node_modules/globals": { - "version": "16.3.0", - "resolved": "https://registry.npmjs.org/globals/-/globals-16.3.0.tgz", - "integrity": "sha512-bqWEnJ1Nt3neqx2q5SFfGS8r/ahumIakg3HcwtNlrVlwXIeNumWn/c7Pn/wKzGhf6SaW6H6uWXLqC30STCMchQ==", + "node_modules/html-void-elements": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/html-void-elements/-/html-void-elements-3.0.0.tgz", + "integrity": "sha512-bEqo66MRXsUGxWHV5IP0PUiAWwoEjba4VCzg0LjFJBpchPaTfyfCKTG6bc5F8ucKec3q5y6qOdGyYTSBEvhCrg==", "dev": true, "license": "MIT", - "engines": { - "node": ">=18" - }, "funding": { - "url": "https://github.com/sponsors/sindresorhus" + "type": "github", + "url": "https://github.com/sponsors/wooorm" } }, - "node_modules/gopd": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", - "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", + "node_modules/http-errors": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.1.tgz", + "integrity": "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==", "dev": true, "license": "MIT", + "dependencies": { + "depd": "~2.0.0", + "inherits": "~2.0.4", + "setprototypeof": "~1.2.0", + "statuses": "~2.0.2", + "toidentifier": "~1.0.1" + }, "engines": { - "node": ">= 0.4" + "node": ">= 0.8" }, "funding": { - "url": "https://github.com/sponsors/ljharb" + "type": "opencollective", + "url": "https://opencollective.com/express" } }, - "node_modules/graceful-fs": { - "version": "4.2.11", - "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", - "integrity": "sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==", - "dev": true, - "license": "ISC" - }, - "node_modules/hachure-fill": { - "version": "0.5.2", - "resolved": "https://registry.npmjs.org/hachure-fill/-/hachure-fill-0.5.2.tgz", - "integrity": "sha512-3GKBOn+m2LX9iq+JC1064cSFprJY4jL1jCXTcpnfER5HYE2l/4EfWSGzkPa/ZDBmYI0ZOEj5VHV/eKnPGkHuOg==", - "dev": true, - "license": "MIT" - }, - "node_modules/has-flag": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", - "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "node_modules/http-proxy": { + "version": "1.18.1", + "resolved": "https://registry.npmjs.org/http-proxy/-/http-proxy-1.18.1.tgz", + "integrity": "sha512-7mz/721AbnJwIVbnaSv1Cz3Am0ZLT/UBwkC92VlxhXv/k/BBQfM2fXElQNC27BVGr0uwUpplYPQM9LnaBMR5NQ==", "dev": true, "license": "MIT", + "dependencies": { + "eventemitter3": "^4.0.0", + "follow-redirects": "^1.0.0", + "requires-port": "^1.0.0" + }, "engines": { - "node": ">=8" + "node": ">=8.0.0" } }, - "node_modules/has-symbols": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", - "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", + "node_modules/http-server": { + "version": "14.1.1", + "resolved": "https://registry.npmjs.org/http-server/-/http-server-14.1.1.tgz", + "integrity": "sha512-+cbxadF40UXd9T01zUHgA+rlo2Bg1Srer4+B4NwIHdaGxAGGv59nYRnGGDJ9LBk7alpS0US+J+bLLdQOOkJq4A==", "dev": true, "license": "MIT", - "engines": { - "node": ">= 0.4" + "dependencies": { + "basic-auth": "^2.0.1", + "chalk": "^4.1.2", + "corser": "^2.0.1", + "he": "^1.2.0", + "html-encoding-sniffer": "^3.0.0", + "http-proxy": "^1.18.1", + "mime": "^1.6.0", + "minimist": "^1.2.6", + "opener": "^1.5.1", + "portfinder": "^1.0.28", + "secure-compare": "3.0.1", + "union": "~0.5.0", + "url-join": "^4.0.1" }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, - "node_modules/hasown": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", - "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "function-bind": "^1.1.2" + "bin": { + "http-server": "bin/http-server" }, "engines": { - "node": ">= 0.4" + "node": ">=12" } }, - "node_modules/hast-util-from-dom": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/hast-util-from-dom/-/hast-util-from-dom-5.0.1.tgz", - "integrity": "sha512-N+LqofjR2zuzTjCPzyDUdSshy4Ma6li7p/c3pA78uTwzFgENbgbUrm2ugwsOdcjI1muO+o6Dgzp9p8WHtn/39Q==", + "node_modules/ico-endec": { + "version": "0.1.6", + "resolved": "https://registry.npmjs.org/ico-endec/-/ico-endec-0.1.6.tgz", + "integrity": "sha512-ZdLU38ZoED3g1j3iEyzcQj+wAkY2xfWNkymszfJPoxucIUhK7NayQ+/C4Kv0nDFMIsbtbEHldv3V8PU494/ueQ==", "dev": true, - "license": "ISC", - "dependencies": { - "@types/hast": "^3.0.0", - "hastscript": "^9.0.0", - "web-namespaces": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } + "license": "MPL-2.0" }, - "node_modules/hast-util-from-html": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/hast-util-from-html/-/hast-util-from-html-2.0.3.tgz", - "integrity": "sha512-CUSRHXyKjzHov8yKsQjGOElXy/3EKpyX56ELnkHH34vDVw1N1XSQ1ZcAvTyAPtGqLTuKP/uxM+aLkSPqF/EtMw==", + "node_modules/iconv-lite": { + "version": "0.7.2", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.2.tgz", + "integrity": "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==", "dev": true, "license": "MIT", "dependencies": { - "@types/hast": "^3.0.0", - "devlop": "^1.1.0", - "hast-util-from-parse5": "^8.0.0", - "parse5": "^7.0.0", - "vfile": "^6.0.0", - "vfile-message": "^4.0.0" + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" }, "funding": { "type": "opencollective", - "url": "https://opencollective.com/unified" + "url": "https://opencollective.com/express" } }, - "node_modules/hast-util-from-html-isomorphic": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/hast-util-from-html-isomorphic/-/hast-util-from-html-isomorphic-2.0.0.tgz", - "integrity": "sha512-zJfpXq44yff2hmE0XmwEOzdWin5xwH+QIhMLOScpX91e/NSGPsAzNCvLQDIEPyO2TXi+lBmU6hjLIhV8MwP2kw==", + "node_modules/idb": { + "version": "7.1.1", + "resolved": "https://registry.npmjs.org/idb/-/idb-7.1.1.tgz", + "integrity": "sha512-gchesWBzyvGHRO9W8tzUWFDycow5gwjvFKfyV9FF32Y7F50yZMp7mP+T2mJIWFx49zicqyC4uefHM17o6xKIVQ==", + "dev": true, + "license": "ISC" + }, + "node_modules/ignore": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.2.tgz", + "integrity": "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==", "dev": true, "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "hast-util-from-dom": "^5.0.0", - "hast-util-from-html": "^2.0.0", - "unist-util-remove-position": "^5.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" + "engines": { + "node": ">= 4" } }, - "node_modules/hast-util-from-html/node_modules/@types/unist": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", - "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", + "node_modules/immutable": { + "version": "5.1.6", + "resolved": "https://registry.npmjs.org/immutable/-/immutable-5.1.6.tgz", + "integrity": "sha512-q1swsS8K7L8usSHuOqF2TAoCCkonYz0SG38wLAggaa4Wml70zixIvt2ql4coQ2C2B3hTjltJry4r6bULwgAXLQ==", "dev": true, "license": "MIT" }, - "node_modules/hast-util-from-html/node_modules/unist-util-stringify-position": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz", - "integrity": "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ==", + "node_modules/import-fresh": { + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", + "integrity": "sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ==", "dev": true, "license": "MIT", "dependencies": { - "@types/unist": "^3.0.0" + "parent-module": "^1.0.0", + "resolve-from": "^4.0.0" + }, + "engines": { + "node": ">=6" }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" + "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/hast-util-from-html/node_modules/vfile-message": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.3.tgz", - "integrity": "sha512-QTHzsGd1EhbZs4AsQ20JX1rC3cOlt/IWJruk893DfLRr57lcnOeMaWG4K0JrRta4mIJZKth2Au3mM3u03/JWKw==", + "node_modules/import-meta-resolve": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/import-meta-resolve/-/import-meta-resolve-4.2.0.tgz", + "integrity": "sha512-Iqv2fzaTQN28s/FwZAoFq0ZSs/7hMAHJVX+w8PZl3cY19Pxk6jFFalxQoIfW2826i/fDLXv8IiEZRIT0lDuWcg==", "dev": true, "license": "MIT", - "dependencies": { - "@types/unist": "^3.0.0", - "unist-util-stringify-position": "^4.0.0" - }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" + "type": "github", + "url": "https://github.com/sponsors/wooorm" } }, - "node_modules/hast-util-from-parse5": { - "version": "8.0.3", - "resolved": "https://registry.npmjs.org/hast-util-from-parse5/-/hast-util-from-parse5-8.0.3.tgz", - "integrity": "sha512-3kxEVkEKt0zvcZ3hCRYI8rqrgwtlIOFMWkbclACvjlDw8Li9S2hk/d51OI0nr/gIpdMHNepwgOKqZ/sy0Clpyg==", + "node_modules/imurmurhash": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz", + "integrity": "sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==", "dev": true, "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "@types/unist": "^3.0.0", - "devlop": "^1.0.0", - "hastscript": "^9.0.0", - "property-information": "^7.0.0", - "vfile": "^6.0.0", - "vfile-location": "^5.0.0", - "web-namespaces": "^2.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" + "engines": { + "node": ">=0.8.19" } }, - "node_modules/hast-util-from-parse5/node_modules/@types/unist": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", - "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", + "node_modules/indent-string": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/indent-string/-/indent-string-4.0.0.tgz", + "integrity": "sha512-EdDDZu4A2OyIK7Lr/2zG+w5jmbuk1DVBnEwREQvBzspBJkCEbRa8GxU1lghYcaGJCnRWibjDXlq779X1/y5xwg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", + "dev": true, + "license": "ISC" + }, + "node_modules/inline-style-parser": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/inline-style-parser/-/inline-style-parser-0.2.7.tgz", + "integrity": "sha512-Nb2ctOyNR8DqQoR0OwRG95uNWIC0C1lCgf5Naz5H6Ji72KZ8OcFZLz2P5sNgwlyoJ8Yif11oMuYs5pBQa86csA==", "dev": true, "license": "MIT" }, - "node_modules/hast-util-is-element": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/hast-util-is-element/-/hast-util-is-element-3.0.0.tgz", - "integrity": "sha512-Val9mnv2IWpLbNPqc/pUem+a7Ipj2aHacCwgNfTiK0vJKl0LF+4Ba4+v1oPHFpf3bLYmreq0/l3Gud9S5OH42g==", + "node_modules/internal-slot": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/internal-slot/-/internal-slot-1.1.0.tgz", + "integrity": "sha512-4gd7VpWNQNB4UKKCFFVcp1AVv+FMOgs9NKzjHKusc8jTMhd5eL1NqQqOpE0KzMds804/yHlglp3uxgluOqAPLw==", "dev": true, "license": "MIT", "dependencies": { - "@types/hast": "^3.0.0" + "es-errors": "^1.3.0", + "hasown": "^2.0.2", + "side-channel": "^1.1.0" }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" + "engines": { + "node": ">= 0.4" } }, - "node_modules/hast-util-parse-selector": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/hast-util-parse-selector/-/hast-util-parse-selector-4.0.0.tgz", - "integrity": "sha512-wkQCkSYoOGCRKERFWcxMVMOcYE2K1AaNLU8DXS9arxnLOUEWbOXKXiJUNzEpqZ3JOKpnha3jkFrumEjVliDe7A==", + "node_modules/internmap": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/internmap/-/internmap-2.0.3.tgz", + "integrity": "sha512-5Hh7Y1wQbvY5ooGgPbDaL5iYLAPzMTUrjMulskHLH6wnv/A+1q5rgEaiuqEjB+oxGXIVZs1FF+R/KPN3ZSQYYg==", "dev": true, - "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" + "license": "ISC", + "engines": { + "node": ">=12" } }, - "node_modules/hast-util-sanitize": { - "version": "5.0.2", - "resolved": "https://registry.npmjs.org/hast-util-sanitize/-/hast-util-sanitize-5.0.2.tgz", - "integrity": "sha512-3yTWghByc50aGS7JlGhk61SPenfE/p1oaFeNwkOOyrscaOkMGrcW9+Cy/QAIOBpZxP1yqDIzFMR0+Np0i0+usg==", + "node_modules/ip-address": { + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.2.0.tgz", + "integrity": "sha512-/+S6j4E9AHvW9SWMSEY9Xfy66O5PWvVEJ08O0y5JGyEKQpojb0K0GKpz/v5HJ/G0vi3D2sjGK78119oXZeE0qA==", "dev": true, "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "@ungap/structured-clone": "^1.0.0", - "unist-util-position": "^5.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" + "engines": { + "node": ">= 12" } }, - "node_modules/hast-util-to-html": { - "version": "9.0.5", - "resolved": "https://registry.npmjs.org/hast-util-to-html/-/hast-util-to-html-9.0.5.tgz", - "integrity": "sha512-OguPdidb+fbHQSU4Q4ZiLKnzWo8Wwsf5bZfbvu7//a9oTYoqD/fWpe96NuHkoS9h0ccGOTe0C4NGXdtS0iObOw==", + "node_modules/ipaddr.js": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz", + "integrity": "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==", "dev": true, "license": "MIT", - "dependencies": { - "@types/hast": "^3.0.0", - "@types/unist": "^3.0.0", - "ccount": "^2.0.0", - "comma-separated-tokens": "^2.0.0", - "hast-util-whitespace": "^3.0.0", - "html-void-elements": "^3.0.0", - "mdast-util-to-hast": "^13.0.0", - "property-information": "^7.0.0", - "space-separated-tokens": "^2.0.0", - "stringify-entities": "^4.0.0", - "zwitch": "^2.0.4" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" + "engines": { + "node": ">= 0.10" } }, - "node_modules/hast-util-to-html/node_modules/@types/unist": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", - "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", - "dev": true, - "license": "MIT" - }, - "node_modules/hast-util-to-text": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/hast-util-to-text/-/hast-util-to-text-4.0.2.tgz", - "integrity": "sha512-KK6y/BN8lbaq654j7JgBydev7wuNMcID54lkRav1P0CaE1e47P72AWWPiGKXTJU271ooYzcvTAn/Zt0REnvc7A==", + "node_modules/is-array-buffer": { + "version": "3.0.5", + "resolved": "https://registry.npmjs.org/is-array-buffer/-/is-array-buffer-3.0.5.tgz", + "integrity": "sha512-DDfANUiiG2wC1qawP66qlTugJeL5HyzMpfr8lLK+jMQirGzNod0B12cFB/9q838Ru27sBwfw78/rdoU7RERz6A==", "dev": true, "license": "MIT", "dependencies": { - "@types/hast": "^3.0.0", - "@types/unist": "^3.0.0", - "hast-util-is-element": "^3.0.0", - "unist-util-find-after": "^5.0.0" + "call-bind": "^1.0.8", + "call-bound": "^1.0.3", + "get-intrinsic": "^1.2.6" + }, + "engines": { + "node": ">= 0.4" }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } - }, - "node_modules/hast-util-to-text/node_modules/@types/unist": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", - "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-arrayish": { + "version": "0.3.4", + "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.4.tgz", + "integrity": "sha512-m6UrgzFVUYawGBh1dUsWR5M2Clqic9RVXC/9f8ceNlv2IcO9j9J/z8UoCLPqtsPBFNzEpfR3xftohbfqDx8EQA==", "dev": true, "license": "MIT" }, - "node_modules/hast-util-whitespace": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/hast-util-whitespace/-/hast-util-whitespace-3.0.0.tgz", - "integrity": "sha512-88JUN06ipLwsnv+dVn+OIYOvAuvBMy/Qoi6O7mQHxdPXpjy+Cd6xRkWwux7DKO+4sYILtLBRIKgsdpS2gQc7qw==", + "node_modules/is-async-function": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-async-function/-/is-async-function-2.1.1.tgz", + "integrity": "sha512-9dgM/cZBnNvjzaMYHVoxxfPj2QXt22Ev7SuuPrs+xav0ukGB0S6d4ydZdEiM48kLx5kDV+QBPrpVnFyefL8kkQ==", "dev": true, "license": "MIT", "dependencies": { - "@types/hast": "^3.0.0" + "async-function": "^1.0.0", + "call-bound": "^1.0.3", + "get-proto": "^1.0.1", + "has-tostringtag": "^1.0.2", + "safe-regex-test": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/hastscript": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/hastscript/-/hastscript-9.0.1.tgz", - "integrity": "sha512-g7df9rMFX/SPi34tyGCyUBREQoKkapwdY/T04Qn9TDWfHhAYt4/I0gMVirzK5wEzeUqIjEB+LXC/ypb7Aqno5w==", + "node_modules/is-bigint": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-bigint/-/is-bigint-1.1.0.tgz", + "integrity": "sha512-n4ZT37wG78iz03xPRKJrHTdZbe3IicyucEtdRsV5yglwc3GyUfbAfpSeD0FJ41NbUNSt5wbhqfp1fS+BgnvDFQ==", "dev": true, "license": "MIT", "dependencies": { - "@types/hast": "^3.0.0", - "comma-separated-tokens": "^2.0.0", - "hast-util-parse-selector": "^4.0.0", - "property-information": "^7.0.0", - "space-separated-tokens": "^2.0.0" + "has-bigints": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/he": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz", - "integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==", + "node_modules/is-boolean-object": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/is-boolean-object/-/is-boolean-object-1.2.2.tgz", + "integrity": "sha512-wa56o2/ElJMYqjCjGkXri7it5FbebW5usLw/nPmCMs5DeZ7eziSYZhSmPRn0txqeW4LnAmQQU7FgqLpsEFKM4A==", "dev": true, "license": "MIT", - "bin": { - "he": "bin/he" + "dependencies": { + "call-bound": "^1.0.3", + "has-tostringtag": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/highlight.js": { - "version": "11.11.1", - "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-11.11.1.tgz", - "integrity": "sha512-Xwwo44whKBVCYoliBQwaPvtd/2tYFkRQtXDWj1nackaV2JPXx3L0+Jvd8/qCJ2p+ML0/XVkJ2q+Mr+UVdpJK5w==", + "node_modules/is-callable": { + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/is-callable/-/is-callable-1.2.7.tgz", + "integrity": "sha512-1BC0BVFhS/p0qtw6enp8e+8OD0UrK0oFLztSjNzhcKA3WDuJxxAPXzPuPtKkjEY9UUoEWlX/8fgKeu2S8i9JTA==", "dev": true, - "license": "BSD-3-Clause", + "license": "MIT", "engines": { - "node": ">=12.0.0" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/hono": { - "version": "4.12.19", - "resolved": "https://registry.npmjs.org/hono/-/hono-4.12.19.tgz", - "integrity": "sha512-xa3eYXYXx68XTT4hZ7dRzsXBhaq85ToSrlUJNoR0gwz/1Ap/CNwX47wfvV7pc/xWhjKVVkLT7zBJy8chhNguqQ==", + "node_modules/is-core-module": { + "version": "2.16.2", + "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.2.tgz", + "integrity": "sha512-evOr8xfXKxE6qSR0hSXL2r3sd7ALj8+7jQEUvPYcm5sgZFdJ+AYzT6yNmJenvIYQBgIGwfwz08sL8zoL7yq2BA==", "dev": true, "license": "MIT", + "dependencies": { + "hasown": "^2.0.3" + }, "engines": { - "node": ">=16.9.0" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/html-encoding-sniffer": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-3.0.0.tgz", - "integrity": "sha512-oWv4T4yJ52iKrufjnyZPkrN0CH3QnrUqdB6In1g5Fe1mia8GmF36gnfNySxoZtxD5+NmYw1EElVXiBk93UeskA==", + "node_modules/is-data-view": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/is-data-view/-/is-data-view-1.0.2.tgz", + "integrity": "sha512-RKtWF8pGmS87i2D6gqQu/l7EYRlVdfzemCJN/P3UOs//x1QE7mfhvzHIApBTRf7axvT6DMGwSwBXYCT0nfB9xw==", "dev": true, "license": "MIT", "dependencies": { - "whatwg-encoding": "^2.0.0" + "call-bound": "^1.0.2", + "get-intrinsic": "^1.2.6", + "is-typed-array": "^1.1.13" }, "engines": { - "node": ">=12" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/html-escaper": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", - "integrity": "sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==", + "node_modules/is-date-object": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.1.0.tgz", + "integrity": "sha512-PwwhEakHVKTdRNVOw+/Gyh0+MzlCl4R6qKvkhuvLtPMggI1WAHt9sOwZxQLSGpUaDnrdyDsomoRgNnCfKNSXXg==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "has-tostringtag": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } }, - "node_modules/html-void-elements": { + "node_modules/is-docker": { "version": "3.0.0", - "resolved": "https://registry.npmjs.org/html-void-elements/-/html-void-elements-3.0.0.tgz", - "integrity": "sha512-bEqo66MRXsUGxWHV5IP0PUiAWwoEjba4VCzg0LjFJBpchPaTfyfCKTG6bc5F8ucKec3q5y6qOdGyYTSBEvhCrg==", + "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-3.0.0.tgz", + "integrity": "sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ==", "dev": true, "license": "MIT", + "bin": { + "is-docker": "cli.js" + }, + "engines": { + "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + }, "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" + "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/http-errors": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.1.tgz", - "integrity": "sha512-4FbRdAX+bSdmo4AUFuS0WNiPz8NgFt+r8ThgNWmlrjQjt1Q7ZR9+zTlce2859x4KSXrwIsaeTqDoKQmtP8pLmQ==", + "node_modules/is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-finalizationregistry": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-finalizationregistry/-/is-finalizationregistry-1.1.1.tgz", + "integrity": "sha512-1pC6N8qWJbWoPtEjgcL2xyhQOP491EQjeUo3qTKcmV8YSDDJrOepfG8pcC7h/QgnQHYSv0mJ3Z/ZWxmatVrysg==", "dev": true, "license": "MIT", "dependencies": { - "depd": "~2.0.0", - "inherits": "~2.0.4", - "setprototypeof": "~1.2.0", - "statuses": "~2.0.2", - "toidentifier": "~1.0.1" + "call-bound": "^1.0.3" }, "engines": { - "node": ">= 0.8" + "node": ">= 0.4" }, "funding": { - "type": "opencollective", - "url": "https://opencollective.com/express" + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/http-proxy": { - "version": "1.18.1", - "resolved": "https://registry.npmjs.org/http-proxy/-/http-proxy-1.18.1.tgz", - "integrity": "sha512-7mz/721AbnJwIVbnaSv1Cz3Am0ZLT/UBwkC92VlxhXv/k/BBQfM2fXElQNC27BVGr0uwUpplYPQM9LnaBMR5NQ==", + "node_modules/is-generator-function": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/is-generator-function/-/is-generator-function-1.1.2.tgz", + "integrity": "sha512-upqt1SkGkODW9tsGNG5mtXTXtECizwtS2kA161M+gJPc1xdb/Ax629af6YrTwcOeQHbewrPNlE5Dx7kzvXTizA==", "dev": true, "license": "MIT", "dependencies": { - "eventemitter3": "^4.0.0", - "follow-redirects": "^1.0.0", - "requires-port": "^1.0.0" + "call-bound": "^1.0.4", + "generator-function": "^2.0.0", + "get-proto": "^1.0.1", + "has-tostringtag": "^1.0.2", + "safe-regex-test": "^1.1.0" }, "engines": { - "node": ">=8.0.0" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/http-server": { - "version": "14.1.1", - "resolved": "https://registry.npmjs.org/http-server/-/http-server-14.1.1.tgz", - "integrity": "sha512-+cbxadF40UXd9T01zUHgA+rlo2Bg1Srer4+B4NwIHdaGxAGGv59nYRnGGDJ9LBk7alpS0US+J+bLLdQOOkJq4A==", + "node_modules/is-glob": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", + "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", "dev": true, "license": "MIT", "dependencies": { - "basic-auth": "^2.0.1", - "chalk": "^4.1.2", - "corser": "^2.0.1", - "he": "^1.2.0", - "html-encoding-sniffer": "^3.0.0", - "http-proxy": "^1.18.1", - "mime": "^1.6.0", - "minimist": "^1.2.6", - "opener": "^1.5.1", - "portfinder": "^1.0.28", - "secure-compare": "3.0.1", - "union": "~0.5.0", - "url-join": "^4.0.1" - }, - "bin": { - "http-server": "bin/http-server" + "is-extglob": "^2.1.1" }, "engines": { - "node": ">=12" + "node": ">=0.10.0" } }, - "node_modules/iconv-lite": { - "version": "0.6.3", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", - "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "node_modules/is-inside-container": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-inside-container/-/is-inside-container-1.0.0.tgz", + "integrity": "sha512-KIYLCCJghfHZxqjYBE7rEy0OBuTd5xCHS7tHVgvCLkx7StIoaxwNW3hCALgEUjFfeRk+MG/Qxmp/vtETEF3tRA==", "dev": true, "license": "MIT", "dependencies": { - "safer-buffer": ">= 2.1.2 < 3.0.0" + "is-docker": "^3.0.0" + }, + "bin": { + "is-inside-container": "cli.js" }, "engines": { - "node": ">=0.10.0" + "node": ">=14.16" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/ignore": { - "version": "5.3.2", - "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.2.tgz", - "integrity": "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==", + "node_modules/is-map": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/is-map/-/is-map-2.0.3.tgz", + "integrity": "sha512-1Qed0/Hr2m+YqxnM09CjA2d/i6YZNfF6R2oRAOj36eUdS6qIV/huPJNSEpKbupewFs+ZsJlxsjjPbc0/afW6Lw==", "dev": true, "license": "MIT", "engines": { - "node": ">= 4" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/immutable": { - "version": "5.1.5", - "resolved": "https://registry.npmjs.org/immutable/-/immutable-5.1.5.tgz", - "integrity": "sha512-t7xcm2siw+hlUM68I+UEOK+z84RzmN59as9DZ7P1l0994DKUWV7UXBMQZVxaoMSRQ+PBZbHCOoBt7a2wxOMt+A==", + "node_modules/is-module": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-module/-/is-module-1.0.0.tgz", + "integrity": "sha512-51ypPSPCoTEIN9dy5Oy+h4pShgJmPCygKfyRCISBI+JoWT/2oJvK8QPxmwv7b/p239jXrm9M1mlQbyKJ5A152g==", "dev": true, "license": "MIT" }, - "node_modules/import-fresh": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", - "integrity": "sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ==", + "node_modules/is-negative-zero": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/is-negative-zero/-/is-negative-zero-2.0.3.tgz", + "integrity": "sha512-5KoIu2Ngpyek75jXodFvnafB6DJgr3u8uuK0LEZJjrU19DrMD3EVERaR8sjz8CCGgpZvxPl9SuE1GMVPFHx1mw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-number-object": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-number-object/-/is-number-object-1.1.1.tgz", + "integrity": "sha512-lZhclumE1G6VYD8VHe35wFaIif+CTy5SJIi5+3y4psDgWu4wPDoBhF8NxUOinEc7pHgiTsT6MaBb92rKhhD+Xw==", "dev": true, "license": "MIT", "dependencies": { - "parent-module": "^1.0.0", - "resolve-from": "^4.0.0" + "call-bound": "^1.0.3", + "has-tostringtag": "^1.0.2" }, "engines": { - "node": ">=6" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-obj": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-obj/-/is-obj-1.0.1.tgz", + "integrity": "sha512-l4RyHgRqGN4Y3+9JHVrNqO+tN0rV5My76uW5/nuO4K1b6vw5G8d/cmFjP9tRfEsdhZNt0IFdZuK/c2Vr4Nb+Qg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-plain-obj": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-4.1.0.tgz", + "integrity": "sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/import-meta-resolve": { - "version": "4.2.0", - "resolved": "https://registry.npmjs.org/import-meta-resolve/-/import-meta-resolve-4.2.0.tgz", - "integrity": "sha512-Iqv2fzaTQN28s/FwZAoFq0ZSs/7hMAHJVX+w8PZl3cY19Pxk6jFFalxQoIfW2826i/fDLXv8IiEZRIT0lDuWcg==", + "node_modules/is-promise": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/is-promise/-/is-promise-4.0.0.tgz", + "integrity": "sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/is-reference": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/is-reference/-/is-reference-3.0.3.tgz", + "integrity": "sha512-ixkJoqQvAP88E6wLydLGGqCJsrFUnqoH6HnaczB8XmDH1oaWU+xxdptvikTgaEhtZ53Ky6YXiBuUI2WXLMCwjw==", "dev": true, "license": "MIT", - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" + "dependencies": { + "@types/estree": "^1.0.6" } }, - "node_modules/imurmurhash": { - "version": "0.1.4", - "resolved": "https://registry.npmjs.org/imurmurhash/-/imurmurhash-0.1.4.tgz", - "integrity": "sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==", + "node_modules/is-regex": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.2.1.tgz", + "integrity": "sha512-MjYsKHO5O7mCsmRGxWcLWheFqN9DJ/2TmngvjKXihe6efViPqc274+Fx/4fYj/r03+ESvBdTXK0V6tA3rgez1g==", "dev": true, "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "gopd": "^1.2.0", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" + }, "engines": { - "node": ">=0.8.19" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/indent-string": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/indent-string/-/indent-string-4.0.0.tgz", - "integrity": "sha512-EdDDZu4A2OyIK7Lr/2zG+w5jmbuk1DVBnEwREQvBzspBJkCEbRa8GxU1lghYcaGJCnRWibjDXlq779X1/y5xwg==", + "node_modules/is-regexp": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/is-regexp/-/is-regexp-1.0.0.tgz", + "integrity": "sha512-7zjFAPO4/gwyQAAgRRmqeEeyIICSdmCqa3tsVHMdBzaXXRiqopZL4Cyghg/XulGWrtABTpbnYYzzIRffLkP4oA==", "dev": true, "license": "MIT", "engines": { - "node": ">=8" + "node": ">=0.10.0" } }, - "node_modules/inherits": { - "version": "2.0.4", - "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", - "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", - "dev": true, - "license": "ISC" - }, - "node_modules/inline-style-parser": { - "version": "0.2.4", - "resolved": "https://registry.npmjs.org/inline-style-parser/-/inline-style-parser-0.2.4.tgz", - "integrity": "sha512-0aO8FkhNZlj/ZIbNi7Lxxr12obT7cL1moPfE4tg1LkX7LlLfC6DeX4l2ZEud1ukP9jNQyNnfzQVqwbwmAATY4Q==", - "dev": true, - "license": "MIT" - }, - "node_modules/internmap": { + "node_modules/is-set": { "version": "2.0.3", - "resolved": "https://registry.npmjs.org/internmap/-/internmap-2.0.3.tgz", - "integrity": "sha512-5Hh7Y1wQbvY5ooGgPbDaL5iYLAPzMTUrjMulskHLH6wnv/A+1q5rgEaiuqEjB+oxGXIVZs1FF+R/KPN3ZSQYYg==", - "dev": true, - "license": "ISC", - "engines": { - "node": ">=12" - } - }, - "node_modules/ip-address": { - "version": "10.2.0", - "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-10.2.0.tgz", - "integrity": "sha512-/+S6j4E9AHvW9SWMSEY9Xfy66O5PWvVEJ08O0y5JGyEKQpojb0K0GKpz/v5HJ/G0vi3D2sjGK78119oXZeE0qA==", + "resolved": "https://registry.npmjs.org/is-set/-/is-set-2.0.3.tgz", + "integrity": "sha512-iPAjerrse27/ygGLxw+EBR9agv9Y6uLeYVJMu+QNCoouJ1/1ri0mGrcWpfCqFZuzzx3WjtwxG098X+n4OuRkPg==", "dev": true, "license": "MIT", "engines": { - "node": ">= 12" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/ipaddr.js": { - "version": "1.9.1", - "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz", - "integrity": "sha512-0KI/607xoxSToH7GjN1FfSbLoU0+btTicjsQSWQlh/hZykN8KpmMf7uYwPW3R+akZ6R/w18ZlXSHBYXiYUPO3g==", + "node_modules/is-shared-array-buffer": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/is-shared-array-buffer/-/is-shared-array-buffer-1.0.4.tgz", + "integrity": "sha512-ISWac8drv4ZGfwKl5slpHG9OwPNty4jOWPRIhBpxOoD+hqITiwuipOQ2bNthAzwA3B4fIjO4Nln74N0S9byq8A==", "dev": true, "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3" + }, "engines": { - "node": ">= 0.10" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/is-docker": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-3.0.0.tgz", - "integrity": "sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ==", + "node_modules/is-stream": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz", + "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==", "dev": true, "license": "MIT", - "bin": { - "is-docker": "cli.js" - }, "engines": { - "node": "^12.20.0 || ^14.13.1 || >=16.0.0" + "node": ">=8" }, "funding": { "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/is-extglob": { - "version": "2.1.1", - "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", - "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", + "node_modules/is-string": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-string/-/is-string-1.1.1.tgz", + "integrity": "sha512-BtEeSsoaQjlSPBemMQIrY1MY0uM6vnS1g5fmufYOtnxLGUZM2178PKbhsk7Ffv58IX+ZtcvoGwccYsh0PglkAA==", "dev": true, "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "has-tostringtag": "^1.0.2" + }, "engines": { - "node": ">=0.10.0" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/is-glob": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", - "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", + "node_modules/is-symbol": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-symbol/-/is-symbol-1.1.1.tgz", + "integrity": "sha512-9gGx6GTtCQM73BgmHQXfDmLtfjjTUDSyoxTCbp5WtoixAhfgsDirWIcVQ/IHpvI5Vgd5i/J5F7B9cN/WlVbC/w==", "dev": true, "license": "MIT", "dependencies": { - "is-extglob": "^2.1.1" + "call-bound": "^1.0.2", + "has-symbols": "^1.1.0", + "safe-regex-test": "^1.1.0" }, "engines": { - "node": ">=0.10.0" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/is-inside-container": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-inside-container/-/is-inside-container-1.0.0.tgz", - "integrity": "sha512-KIYLCCJghfHZxqjYBE7rEy0OBuTd5xCHS7tHVgvCLkx7StIoaxwNW3hCALgEUjFfeRk+MG/Qxmp/vtETEF3tRA==", + "node_modules/is-typed-array": { + "version": "1.1.15", + "resolved": "https://registry.npmjs.org/is-typed-array/-/is-typed-array-1.1.15.tgz", + "integrity": "sha512-p3EcsicXjit7SaskXHs1hA91QxgTw46Fv6EFKKGS5DRFLD8yKnohjF3hxoju94b/OcMZoQukzpPpBE9uLVKzgQ==", "dev": true, "license": "MIT", "dependencies": { - "is-docker": "^3.0.0" - }, - "bin": { - "is-inside-container": "cli.js" + "which-typed-array": "^1.1.16" }, "engines": { - "node": ">=14.16" + "node": ">= 0.4" }, "funding": { - "url": "https://github.com/sponsors/sindresorhus" + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/is-number": { - "version": "7.0.0", - "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", - "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", + "node_modules/is-weakmap": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/is-weakmap/-/is-weakmap-2.0.2.tgz", + "integrity": "sha512-K5pXYOm9wqY1RgjpL3YTkF39tni1XajUIkawTLUo9EZEVUFga5gSQJF8nNS7ZwJQ02y+1YCNYcMh+HIf1ZqE+w==", "dev": true, "license": "MIT", - "optional": true, "engines": { - "node": ">=0.12.0" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/is-plain-obj": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/is-plain-obj/-/is-plain-obj-4.1.0.tgz", - "integrity": "sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg==", + "node_modules/is-weakref": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-weakref/-/is-weakref-1.1.1.tgz", + "integrity": "sha512-6i9mGWSlqzNMEqpCp93KwRS1uUOodk2OJ6b+sq7ZPDSy2WuI5NFIxp/254TytR8ftefexkWn5xNiHUNpPOfSew==", "dev": true, "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3" + }, "engines": { - "node": ">=12" + "node": ">= 0.4" }, "funding": { - "url": "https://github.com/sponsors/sindresorhus" + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/is-promise": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/is-promise/-/is-promise-4.0.0.tgz", - "integrity": "sha512-hvpoI6korhJMnej285dSg6nu1+e6uxs7zG3BYAm5byqDsgJNWwxzM6z6iZiAgQR4TJ30JmBTOwqZUw3WlyH3AQ==", + "node_modules/is-weakset": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/is-weakset/-/is-weakset-2.0.4.tgz", + "integrity": "sha512-mfcwb6IzQyOKTs84CQMrOwW4gQcaTOAWJ0zzJCl2WSPDrWk/OzDaImWFH3djXhb24g4eudZfLRozAvPGw4d9hQ==", "dev": true, - "license": "MIT" + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "get-intrinsic": "^1.2.6" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } }, "node_modules/is-wsl": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-3.1.0.tgz", - "integrity": "sha512-UcVfVfaK4Sc4m7X3dUSoHoozQGBEFeDC+zVo06t98xe8CzHSZZBekNXH+tu0NalHolcJ/QAGqS46Hef7QXBIMw==", + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-3.1.1.tgz", + "integrity": "sha512-e6rvdUCiQCAuumZslxRJWR/Doq4VpPR82kqclvcS0efgt430SlGIk05vdCN58+VrzgtIcfNODjozVielycD4Sw==", "dev": true, "license": "MIT", "dependencies": { @@ -7000,6 +10995,13 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/isarray": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-2.0.5.tgz", + "integrity": "sha512-xHjhDr3cNBK0BzdUJSPXZntQUx/mwMS5Rw4A7lPJ90XGAO6ISP/ePDNuo0vhqOZU+UD5JoodwCAAoZQd3FeAKw==", + "dev": true, + "license": "MIT" + }, "node_modules/isexe": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", @@ -7046,10 +11048,28 @@ "node": ">=8" } }, + "node_modules/jake": { + "version": "10.9.4", + "resolved": "https://registry.npmjs.org/jake/-/jake-10.9.4.tgz", + "integrity": "sha512-wpHYzhxiVQL+IV05BLE2Xn34zW1S223hvjtqk0+gsPrwd/8JNLXJgZZM/iPFsYc1xyphF+6M6EvdE5E9MBGkDA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "async": "^3.2.6", + "filelist": "^1.0.4", + "picocolors": "^1.1.1" + }, + "bin": { + "jake": "bin/cli.js" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/jiti": { - "version": "2.4.2", - "resolved": "https://registry.npmjs.org/jiti/-/jiti-2.4.2.tgz", - "integrity": "sha512-rg9zJN+G4n2nfJl5MW3BMygZX56zKPNVEYYqq7adpmMh4Jn2QNEwhvQlFy6jPVdcod7txZtKHWnyZiA3a0zP7A==", + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/jiti/-/jiti-2.7.0.tgz", + "integrity": "sha512-AC/7JofJvZGrrneWNaEnJeOLUx+JlGt7tNa0wZiRPT4MY1wmfKjt2+6O2p2uz2+skll8OZZmJMNqeke7kKbNgQ==", "dev": true, "license": "MIT", "bin": { @@ -7057,9 +11077,9 @@ } }, "node_modules/jose": { - "version": "6.1.3", - "resolved": "https://registry.npmjs.org/jose/-/jose-6.1.3.tgz", - "integrity": "sha512-0TpaTfihd4QMNwrz/ob2Bp7X04yuxJkjRGi4aKmOqwhov54i6u79oCv7T+C7lo70MKH6BesI3vscD1yb/yzKXQ==", + "version": "6.2.3", + "resolved": "https://registry.npmjs.org/jose/-/jose-6.2.3.tgz", + "integrity": "sha512-YYVDInQKFJfR/xa3ojUTl8c2KoTwiL1R5Wg9YCydwH0x0B9grbzlg5HC7mMjCtUJjbQ/YnGEZIhI5tCgfTb4Hw==", "dev": true, "license": "MIT", "funding": { @@ -7067,17 +11087,27 @@ } }, "node_modules/js-tokens": { - "version": "10.0.0", - "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-10.0.0.tgz", - "integrity": "sha512-lM/UBzQmfJRo9ABXbPWemivdCW8V2G8FHaHdypQaIy523snUjog0W71ayWXTjiR+ixeMyVHN2XcpnTd/liPg/Q==", + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", + "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", "dev": true, "license": "MIT" }, "node_modules/js-yaml": { - "version": "4.1.1", - "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz", - "integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==", + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.2.0.tgz", + "integrity": "sha512-ePWsvanv0DWuDRsW8dnt+R4jQ31SCRCQ7hhNcPXZPsoBZiemuZNYGf7adZdqX2D86j6rvKp3RpCxVTSb8WQlOw==", "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/puzrin" + }, + { + "type": "github", + "url": "https://github.com/sponsors/nodeca" + } + ], "license": "MIT", "dependencies": { "argparse": "^2.0.1" @@ -7086,6 +11116,19 @@ "js-yaml": "bin/js-yaml.js" } }, + "node_modules/jsesc": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-3.1.0.tgz", + "integrity": "sha512-/sM3dO2FOzXjKQhJuo0Q173wf2KOo8t4I8vHy6lF9poUp7bKT0/NHE8fPX23PwfhnykfqnC2xRxOnVw5XuGIaA==", + "dev": true, + "license": "MIT", + "bin": { + "jsesc": "bin/jsesc" + }, + "engines": { + "node": ">=6" + } + }, "node_modules/json-buffer": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz", @@ -7094,9 +11137,9 @@ "license": "MIT" }, "node_modules/json-schema-traverse": { - "version": "0.4.1", - "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", - "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==", + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", + "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==", "dev": true, "license": "MIT" }, @@ -7114,10 +11157,23 @@ "dev": true, "license": "MIT" }, + "node_modules/json5": { + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/json5/-/json5-2.2.3.tgz", + "integrity": "sha512-XmOWe7eyHYH14cLdVPoyg+GOH3rYX++KpzrylJwSW98t3Nk+U8XOl8FWKOgwtzdb8lXGf6zYwDUzeHMWfxasyg==", + "dev": true, + "license": "MIT", + "bin": { + "json5": "lib/cli.js" + }, + "engines": { + "node": ">=6" + } + }, "node_modules/jsonfile": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.1.0.tgz", - "integrity": "sha512-5dgndWOriYSm5cnYaJNhalLNDKOqFwyDB/rr1E9ZsGciGvKPs8R2xYGCacuf3z6K1YKDz182fd+fY3cn3pMqXQ==", + "version": "6.2.1", + "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.2.1.tgz", + "integrity": "sha512-zwOTdL3rFQ/lRdBnntKVOX6k5cKJwEc1HdilT71BWEu7J41gXIB2MRp+vxduPSwZJPWBxEzv4yH1wYLJGUHX4Q==", "dev": true, "license": "MIT", "dependencies": { @@ -7127,6 +11183,16 @@ "graceful-fs": "^4.1.6" } }, + "node_modules/jsonpointer": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/jsonpointer/-/jsonpointer-5.0.1.tgz", + "integrity": "sha512-p/nXbhSEcu3pZRdkW1OfJhpsVtW1gd4Wa1fnQc9YLiTfAjn0312eMKimbdIQzuZl9aa9xUGaRlP9T/CJE/ditQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/katex": { "version": "0.16.47", "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.47.tgz", @@ -7144,6 +11210,16 @@ "katex": "cli.js" } }, + "node_modules/katex/node_modules/commander": { + "version": "8.3.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-8.3.0.tgz", + "integrity": "sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 12" + } + }, "node_modules/keyv": { "version": "4.5.4", "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz", @@ -7177,6 +11253,13 @@ "dev": true, "license": "MIT" }, + "node_modules/kolorist": { + "version": "1.8.0", + "resolved": "https://registry.npmjs.org/kolorist/-/kolorist-1.8.0.tgz", + "integrity": "sha512-Y+60/zizpJ3HRH8DCss+q95yr6145JXZo46OTpFvDZWLfRCE4qChOyk1b26nMaNpfHHgxagk9dXT5OP0Tfe+dQ==", + "dev": true, + "license": "MIT" + }, "node_modules/layout-base": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/layout-base/-/layout-base-1.0.2.tgz", @@ -7184,6 +11267,16 @@ "dev": true, "license": "MIT" }, + "node_modules/leven": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/leven/-/leven-3.1.0.tgz", + "integrity": "sha512-qsda+H8jTaUaN/x5vzW2rzc+8Rw4TAQ/4KjB46IwK5VH+IlVeeeje/EoZRpiXvIqjFgK84QffqPztGI3VBLG1A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/levn": { "version": "0.4.1", "resolved": "https://registry.npmjs.org/levn/-/levn-0.4.1.tgz", @@ -7484,6 +11577,13 @@ "dev": true, "license": "MIT" }, + "node_modules/lodash.debounce": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/lodash.debounce/-/lodash.debounce-4.0.8.tgz", + "integrity": "sha512-FT1yDzDYEoYWhnSGnpE/4Kj1fLZkDFyqRb7fNt6FdYOSxlUWAtp42Eh6Wb0rGIv/m9Bgo7x4GhQbm5Ys4SG5ow==", + "dev": true, + "license": "MIT" + }, "node_modules/lodash.isplainobject": { "version": "4.0.6", "resolved": "https://registry.npmjs.org/lodash.isplainobject/-/lodash.isplainobject-4.0.6.tgz", @@ -7498,6 +11598,13 @@ "dev": true, "license": "MIT" }, + "node_modules/lodash.sortby": { + "version": "4.7.0", + "resolved": "https://registry.npmjs.org/lodash.sortby/-/lodash.sortby-4.7.0.tgz", + "integrity": "sha512-HDWXG8isMntAyRF5vZ7xKuEvOhT4AhlRt/3czTSjvGUxjYCBVRQY48ViDHyfYz9VIoBkW4TMGQNapx+l3RUwdA==", + "dev": true, + "license": "MIT" + }, "node_modules/longest-streak": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/longest-streak/-/longest-streak-3.1.0.tgz", @@ -7510,9 +11617,9 @@ } }, "node_modules/loupe": { - "version": "3.1.4", - "resolved": "https://registry.npmjs.org/loupe/-/loupe-3.1.4.tgz", - "integrity": "sha512-wJzkKwJrheKtknCOKNEtDK4iqg/MxmZheEMtSTYvnzRdEYaZzmgH976nenp8WdJRdx5Vc1X/9MO0Oszl6ezeXg==", + "version": "3.2.1", + "resolved": "https://registry.npmjs.org/loupe/-/loupe-3.2.1.tgz", + "integrity": "sha512-CdzqowRJCeLU72bHvWqwRBBlLcMEtIvGrlvef74kMnV2AolS9Y8xUv1I0U/MNAWMhBlKIoyuEgoJ0t/bbwHbLQ==", "dev": true, "license": "MIT" }, @@ -7618,6 +11725,7 @@ "version": "3.0.0", "resolved": "https://registry.npmjs.org/mdast/-/mdast-3.0.0.tgz", "integrity": "sha512-xySmf8g4fPKMeC07jXGz971EkLbWAJ83s4US2Tj9lEdnZ142UP5grN73H1Xd3HzrdbU5o9GYYP/y8F9ZSwLE9g==", + "deprecated": "`mdast` was renamed to `remark`", "dev": true, "license": "MIT" }, @@ -7652,9 +11760,9 @@ } }, "node_modules/mdast-util-from-markdown": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/mdast-util-from-markdown/-/mdast-util-from-markdown-2.0.2.tgz", - "integrity": "sha512-uZhTV/8NBuw0WHkPTrCqDOl0zVe1BIng5ZtHoDk49ME1qqcjYmmLmOf0gELgcRMxN4w2iuIeVso5/6QymSrgmA==", + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/mdast-util-from-markdown/-/mdast-util-from-markdown-2.0.3.tgz", + "integrity": "sha512-W4mAWTvSlKvf8L6J+VN9yLSqQ9AOAAvHuoDAmPkz4dHf553m5gVj2ejadHJhoJmcmxEnOv6Pa8XJhpxE93kb8Q==", "dev": true, "license": "MIT", "dependencies": { @@ -7681,21 +11789,7 @@ "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", "dev": true, - "license": "MIT" - }, - "node_modules/mdast-util-from-markdown/node_modules/unist-util-stringify-position": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz", - "integrity": "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/unist": "^3.0.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/unified" - } + "license": "MIT" }, "node_modules/mdast-util-gfm": { "version": "3.1.0", @@ -7920,9 +12014,9 @@ } }, "node_modules/mdsvex": { - "version": "0.12.6", - "resolved": "https://registry.npmjs.org/mdsvex/-/mdsvex-0.12.6.tgz", - "integrity": "sha512-pupx2gzWh3hDtm/iDW4WuCpljmyHbHi34r7ktOqpPGvyiM4MyfNgdJ3qMizXdgCErmvYC9Nn/qyjePy+4ss9Wg==", + "version": "0.12.7", + "resolved": "https://registry.npmjs.org/mdsvex/-/mdsvex-0.12.7.tgz", + "integrity": "sha512-gx4bReLCUvq+MPErHXYeyX+TEq1hsS2KfiZtEOMNTcbibSouFy8AHc5h04KbGCl+g5tLuo4/lbgRVYRnc7bJZw==", "dev": true, "license": "MIT", "dependencies": { @@ -8643,35 +12737,6 @@ ], "license": "MIT" }, - "node_modules/micromatch": { - "version": "4.0.8", - "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", - "integrity": "sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==", - "dev": true, - "license": "MIT", - "optional": true, - "dependencies": { - "braces": "^3.0.3", - "picomatch": "^2.3.1" - }, - "engines": { - "node": ">=8.6" - } - }, - "node_modules/micromatch/node_modules/picomatch": { - "version": "2.3.2", - "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.2.tgz", - "integrity": "sha512-V7+vQEJ06Z+c5tSye8S+nHUfI51xoXIXjHQ99cQtKUkQqqO1kO/KCJUfZXuB47h/YBlDhah2H3hdUGXn8ie0oA==", - "dev": true, - "license": "MIT", - "optional": true, - "engines": { - "node": ">=8.6" - }, - "funding": { - "url": "https://github.com/sponsors/jonschlinkert" - } - }, "node_modules/mime": { "version": "1.6.0", "resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz", @@ -8756,11 +12821,11 @@ } }, "node_modules/minipass": { - "version": "7.1.2", - "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.2.tgz", - "integrity": "sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==", + "version": "7.1.3", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-7.1.3.tgz", + "integrity": "sha512-tEBHqDnIoM/1rXME1zgka9g6Q2lcoCkxHLuc7ODJ5BxbP5d4c2Z5cGgtXAku59200Cx7diuHTOYfSBD8n6mm8A==", "dev": true, - "license": "ISC", + "license": "BlueOak-1.0.0", "engines": { "node": ">=16 || 14 >=14.17" } @@ -8792,6 +12857,59 @@ "svelte": "^5.27.0" } }, + "node_modules/mode-watcher/node_modules/runed": { + "version": "0.25.0", + "resolved": "https://registry.npmjs.org/runed/-/runed-0.25.0.tgz", + "integrity": "sha512-7+ma4AG9FT2sWQEA0Egf6mb7PBT2vHyuHail1ie8ropfSjvZGtEAx8YTmUjv/APCsdRRxEVvArNjALk9zFSOrg==", + "dev": true, + "funding": [ + "https://github.com/sponsors/huntabyte", + "https://github.com/sponsors/tglide" + ], + "dependencies": { + "esm-env": "^1.0.0" + }, + "peerDependencies": { + "svelte": "^5.7.0" + } + }, + "node_modules/mode-watcher/node_modules/svelte-toolbelt": { + "version": "0.7.1", + "resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.7.1.tgz", + "integrity": "sha512-HcBOcR17Vx9bjaOceUvxkY3nGmbBmCBBbuWLLEWO6jtmWH8f/QoWmbyUfQZrpDINH39en1b8mptfPQT9VKQ1xQ==", + "dev": true, + "funding": [ + "https://github.com/sponsors/huntabyte" + ], + "dependencies": { + "clsx": "^2.1.1", + "runed": "^0.23.2", + "style-to-object": "^1.0.8" + }, + "engines": { + "node": ">=18", + "pnpm": ">=8.7.0" + }, + "peerDependencies": { + "svelte": "^5.0.0" + } + }, + "node_modules/mode-watcher/node_modules/svelte-toolbelt/node_modules/runed": { + "version": "0.23.4", + "resolved": "https://registry.npmjs.org/runed/-/runed-0.23.4.tgz", + "integrity": "sha512-9q8oUiBYeXIDLWNK5DfCWlkL0EW3oGbk845VdKlPeia28l751VpfesaB/+7pI6rnbx1I6rqoZ2fZxptOJLxILA==", + "dev": true, + "funding": [ + "https://github.com/sponsors/huntabyte", + "https://github.com/sponsors/tglide" + ], + "dependencies": { + "esm-env": "^1.0.0" + }, + "peerDependencies": { + "svelte": "^5.7.0" + } + }, "node_modules/mri": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/mri/-/mri-1.2.0.tgz", @@ -8820,9 +12938,9 @@ "license": "MIT" }, "node_modules/nanoid": { - "version": "3.3.11", - "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", - "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "version": "3.3.12", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.12.tgz", + "integrity": "sha512-ZB9RH/39qpq5Vu6Y+NmUaFhQR6pp+M2Xt76XBnEwDaGcVAqhlvxrl3B2bKS5D3NH3QR76v3aSrKaF/Kiy7lEtQ==", "dev": true, "funding": [ { @@ -8863,6 +12981,16 @@ "license": "MIT", "optional": true }, + "node_modules/node-releases": { + "version": "2.0.47", + "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.47.tgz", + "integrity": "sha512-Uzmd6LXpouKo8EUK68IjH4+E01w/hXyV3R3g/geCJo+rXLNfh1xucB+LOzYEOQPSiUK3h/xZf0cQGcSsmyL2Og==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + } + }, "node_modules/object-assign": { "version": "4.1.1", "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", @@ -8886,6 +13014,37 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/object-keys": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.1.1.tgz", + "integrity": "sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/object.assign": { + "version": "4.1.7", + "resolved": "https://registry.npmjs.org/object.assign/-/object.assign-4.1.7.tgz", + "integrity": "sha512-nK28WOo+QIjBkDduTINE4JkF/UJJKyf2EJxvJKfblDpyg0Q+pkOHNTL0Qwy6NP6FhE/EnzV73BxxqcJaXY9anw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.3", + "define-properties": "^1.2.1", + "es-object-atoms": "^1.0.0", + "has-symbols": "^1.1.0", + "object-keys": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/obug": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/obug/-/obug-2.1.2.tgz", @@ -8970,6 +13129,93 @@ "node": ">= 0.8.0" } }, + "node_modules/own-keys": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/own-keys/-/own-keys-1.0.1.tgz", + "integrity": "sha512-qFOyK5PjiWZd+QQIh+1jhdb9LpxTF0qs7Pm8o5QHYZ0M3vKqSqzsZaEB6oWlxZ+q2sJBMI/Ktgd2N5ZwQoRHfg==", + "dev": true, + "license": "MIT", + "dependencies": { + "get-intrinsic": "^1.2.6", + "object-keys": "^1.1.1", + "safe-push-apply": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/oxc-parser": { + "version": "0.127.0", + "resolved": "https://registry.npmjs.org/oxc-parser/-/oxc-parser-0.127.0.tgz", + "integrity": "sha512-bkgD4qHlN7WxLdX8bLXdaU54TtQtAIg/ZBAfm0aje/mo3MRDo3P0hZSgr4U7O3xfX+fQmR5AP04JS/TGcZLcFA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@oxc-project/types": "^0.127.0" + }, + "engines": { + "node": "^20.19.0 || >=22.12.0" + }, + "funding": { + "url": "https://github.com/sponsors/Boshen" + }, + "optionalDependencies": { + "@oxc-parser/binding-android-arm-eabi": "0.127.0", + "@oxc-parser/binding-android-arm64": "0.127.0", + "@oxc-parser/binding-darwin-arm64": "0.127.0", + "@oxc-parser/binding-darwin-x64": "0.127.0", + "@oxc-parser/binding-freebsd-x64": "0.127.0", + "@oxc-parser/binding-linux-arm-gnueabihf": "0.127.0", + "@oxc-parser/binding-linux-arm-musleabihf": "0.127.0", + "@oxc-parser/binding-linux-arm64-gnu": "0.127.0", + "@oxc-parser/binding-linux-arm64-musl": "0.127.0", + "@oxc-parser/binding-linux-ppc64-gnu": "0.127.0", + "@oxc-parser/binding-linux-riscv64-gnu": "0.127.0", + "@oxc-parser/binding-linux-riscv64-musl": "0.127.0", + "@oxc-parser/binding-linux-s390x-gnu": "0.127.0", + "@oxc-parser/binding-linux-x64-gnu": "0.127.0", + "@oxc-parser/binding-linux-x64-musl": "0.127.0", + "@oxc-parser/binding-openharmony-arm64": "0.127.0", + "@oxc-parser/binding-wasm32-wasi": "0.127.0", + "@oxc-parser/binding-win32-arm64-msvc": "0.127.0", + "@oxc-parser/binding-win32-ia32-msvc": "0.127.0", + "@oxc-parser/binding-win32-x64-msvc": "0.127.0" + } + }, + "node_modules/oxc-resolver": { + "version": "11.20.0", + "resolved": "https://registry.npmjs.org/oxc-resolver/-/oxc-resolver-11.20.0.tgz", + "integrity": "sha512-CblytBiV/a/ZXY34dsVU2NxhIOxMXst8CvDCtyBelVITgd7PLrKzbEbA6oKLdPjvDKDzCiW48qzmzZ+mYaqn+g==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/Boshen" + }, + "optionalDependencies": { + "@oxc-resolver/binding-android-arm-eabi": "11.20.0", + "@oxc-resolver/binding-android-arm64": "11.20.0", + "@oxc-resolver/binding-darwin-arm64": "11.20.0", + "@oxc-resolver/binding-darwin-x64": "11.20.0", + "@oxc-resolver/binding-freebsd-x64": "11.20.0", + "@oxc-resolver/binding-linux-arm-gnueabihf": "11.20.0", + "@oxc-resolver/binding-linux-arm-musleabihf": "11.20.0", + "@oxc-resolver/binding-linux-arm64-gnu": "11.20.0", + "@oxc-resolver/binding-linux-arm64-musl": "11.20.0", + "@oxc-resolver/binding-linux-ppc64-gnu": "11.20.0", + "@oxc-resolver/binding-linux-riscv64-gnu": "11.20.0", + "@oxc-resolver/binding-linux-riscv64-musl": "11.20.0", + "@oxc-resolver/binding-linux-s390x-gnu": "11.20.0", + "@oxc-resolver/binding-linux-x64-gnu": "11.20.0", + "@oxc-resolver/binding-linux-x64-musl": "11.20.0", + "@oxc-resolver/binding-openharmony-arm64": "11.20.0", + "@oxc-resolver/binding-wasm32-wasi": "11.20.0", + "@oxc-resolver/binding-win32-arm64-msvc": "11.20.0", + "@oxc-resolver/binding-win32-x64-msvc": "11.20.0" + } + }, "node_modules/p-limit": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", @@ -9002,6 +13248,13 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/package-json-from-dist": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/package-json-from-dist/-/package-json-from-dist-1.0.1.tgz", + "integrity": "sha512-UEZIS3/by4OC8vL3P2dTXRETpebLI2NiI5vIrjaD/5UtrkFX/tNbwjTSRAGC/+7CAo2pIcBaRgWmcBBHcsaCIw==", + "dev": true, + "license": "BlueOak-1.0.0" + }, "node_modules/package-manager-detector": { "version": "1.6.0", "resolved": "https://registry.npmjs.org/package-manager-detector/-/package-manager-detector-1.6.0.tgz", @@ -9072,6 +13325,13 @@ "node": ">=8" } }, + "node_modules/path-parse": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz", + "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==", + "dev": true, + "license": "MIT" + }, "node_modules/path-to-regexp": { "version": "8.4.2", "resolved": "https://registry.npmjs.org/path-to-regexp/-/path-to-regexp-8.4.2.tgz", @@ -9217,10 +13477,20 @@ "node": ">= 10.12" } }, + "node_modules/possible-typed-array-names": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/possible-typed-array-names/-/possible-typed-array-names-1.1.0.tgz", + "integrity": "sha512-/+5VFTchJDoVj3bhoqi6UeymcD00DAwb1nJwamzPvHEszJ4FpF6SNNbUbOS8yI56qHzdV8eK0qEfOSiodkTdxg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, "node_modules/postcss": { - "version": "8.5.14", - "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.14.tgz", - "integrity": "sha512-SoSL4+OSEtR99LHFZQiJLkT59C5B1amGO1NzTwj7TT1qCUgUO6hxOvzkOYxD+vMrXBM3XJIKzokoERdqQq/Zmg==", + "version": "8.5.15", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.15.tgz", + "integrity": "sha512-FfR8sjd4em2T6fb3I2MwAJU7HWVMr9zba+enmQeeWFfCbm+UOC/0X4DS8XtpUTMwWMGbjKYP7xjfNekzyGmB3A==", "dev": true, "funding": [ { @@ -9238,7 +13508,7 @@ ], "license": "MIT", "dependencies": { - "nanoid": "^3.3.11", + "nanoid": "^3.3.12", "picocolors": "^1.1.1", "source-map-js": "^1.2.1" }, @@ -9365,9 +13635,9 @@ } }, "node_modules/prettier": { - "version": "3.6.2", - "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.6.2.tgz", - "integrity": "sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==", + "version": "3.8.3", + "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.8.3.tgz", + "integrity": "sha512-7igPTM53cGHMW8xWuVTydi2KO233VFiTNyF5hLJqpilHfmn8C8gPf+PS7dUT64YcXFbiMGZxS9pCSxL/Dxm/Jw==", "dev": true, "license": "MIT", "bin": { @@ -9381,24 +13651,27 @@ } }, "node_modules/prettier-plugin-svelte": { - "version": "3.4.0", - "resolved": "https://registry.npmjs.org/prettier-plugin-svelte/-/prettier-plugin-svelte-3.4.0.tgz", - "integrity": "sha512-pn1ra/0mPObzqoIQn/vUTR3ZZI6UuZ0sHqMK5x2jMLGrs53h0sXhkVuDcrlssHwIMk7FYrMjHBPoUSyyEEDlBQ==", + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/prettier-plugin-svelte/-/prettier-plugin-svelte-4.1.0.tgz", + "integrity": "sha512-YZkhA2Q9oOerFFG9tq+2f98WYT7Z2JgrybJrAyrB78jpsH9i/DdgplXemehuFPgsldetFNCcR/yCcYlDjPy94Q==", "dev": true, "license": "MIT", + "engines": { + "node": ">=20" + }, "peerDependencies": { "prettier": "^3.0.0", - "svelte": "^3.2.0 || ^4.0.0-next.0 || ^5.0.0-next.0" + "svelte": "^5.0.0" } }, "node_modules/prettier-plugin-tailwindcss": { - "version": "0.6.14", - "resolved": "https://registry.npmjs.org/prettier-plugin-tailwindcss/-/prettier-plugin-tailwindcss-0.6.14.tgz", - "integrity": "sha512-pi2e/+ZygeIqntN+vC573BcW5Cve8zUB0SSAGxqpB4f96boZF4M3phPVoOFCeypwkpRYdi7+jQ5YJJUwrkGUAg==", + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/prettier-plugin-tailwindcss/-/prettier-plugin-tailwindcss-0.8.0.tgz", + "integrity": "sha512-V8ITGH87yuBDF6JpEZTOVlUz/saAwqb8f3HRgUj8Lh+tGCcrmorhsLpYqzygwFwK0PE2Ib6Mv3M7T/uE2tZV1g==", "dev": true, "license": "MIT", "engines": { - "node": ">=14.21.3" + "node": ">=20.19" }, "peerDependencies": { "@ianvs/prettier-plugin-sort-imports": "*", @@ -9411,14 +13684,12 @@ "prettier": "^3.0", "prettier-plugin-astro": "*", "prettier-plugin-css-order": "*", - "prettier-plugin-import-sort": "*", "prettier-plugin-jsdoc": "*", "prettier-plugin-marko": "*", "prettier-plugin-multiline-arrays": "*", "prettier-plugin-organize-attributes": "*", "prettier-plugin-organize-imports": "*", "prettier-plugin-sort-imports": "*", - "prettier-plugin-style-order": "*", "prettier-plugin-svelte": "*" }, "peerDependenciesMeta": { @@ -9449,9 +13720,6 @@ "prettier-plugin-css-order": { "optional": true }, - "prettier-plugin-import-sort": { - "optional": true - }, "prettier-plugin-jsdoc": { "optional": true }, @@ -9470,14 +13738,24 @@ "prettier-plugin-sort-imports": { "optional": true }, - "prettier-plugin-style-order": { - "optional": true - }, "prettier-plugin-svelte": { "optional": true } } }, + "node_modules/pretty-bytes": { + "version": "6.1.1", + "resolved": "https://registry.npmjs.org/pretty-bytes/-/pretty-bytes-6.1.1.tgz", + "integrity": "sha512-mQUvGU6aUFQ+rNvTIAcZuWGRT9a6f6Yrg9bHs4ImKF+HZCEK+plBvnAZYSIQztknZF2qnzNtr6F8s0+IuptdlQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^14.13.1 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/pretty-format": { "version": "27.5.1", "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-27.5.1.tgz", @@ -9526,9 +13804,9 @@ } }, "node_modules/property-information": { - "version": "7.1.0", - "resolved": "https://registry.npmjs.org/property-information/-/property-information-7.1.0.tgz", - "integrity": "sha512-TwEZ+X+yCJmYfL7TPUOcvBZ4QfoT5YenQiJuX//0th53DE6w0xxLEtfK3iyryQFddXuvkIk51EEgrJQ0WJkOmQ==", + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/property-information/-/property-information-7.2.0.tgz", + "integrity": "sha512-IAtzIB6sUiWaJYrX9smp3V46pBGbBeLFRGdh25kg1334VcBlD8HzhPeNIWQH9zhGmo2itIe25EHt9dQP7G5hmg==", "dev": true, "license": "MIT", "funding": { @@ -9576,6 +13854,23 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/quansync": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/quansync/-/quansync-1.0.0.tgz", + "integrity": "sha512-5xZacEEufv3HSTPQuchrvV6soaiACMFnq1H8wkVioctoH3TRha9Sz66lOxRwPK/qZj7HPiSveih9yAyh98gvqA==", + "dev": true, + "funding": [ + { + "type": "individual", + "url": "https://github.com/sponsors/antfu" + }, + { + "type": "individual", + "url": "https://github.com/sponsors/sxzz" + } + ], + "license": "MIT" + }, "node_modules/range-parser": { "version": "1.2.1", "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz", @@ -9602,27 +13897,10 @@ "node": ">= 0.10" } }, - "node_modules/raw-body/node_modules/iconv-lite": { - "version": "0.7.2", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.7.2.tgz", - "integrity": "sha512-im9DjEDQ55s9fL4EYzOAv0yMqmMBSZp6G0VvFyTMPKWxiSBHUj9NW/qqLmXUwXrrM7AvqSlTCfvqRb0cM8yYqw==", - "dev": true, - "license": "MIT", - "dependencies": { - "safer-buffer": ">= 2.1.2 < 3.0.0" - }, - "engines": { - "node": ">=0.10.0" - }, - "funding": { - "type": "opencollective", - "url": "https://opencollective.com/express" - } - }, "node_modules/react": { - "version": "19.1.0", - "resolved": "https://registry.npmjs.org/react/-/react-19.1.0.tgz", - "integrity": "sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg==", + "version": "19.2.7", + "resolved": "https://registry.npmjs.org/react/-/react-19.2.7.tgz", + "integrity": "sha512-HNe9WslTbXmFK8o8cmwgAeJFSBvt1bPdHCVKtaaV+WlAN36mpT4hcRpwbf3fY56ar2oIXzsBpOAiIRHAdY0OlQ==", "dev": true, "license": "MIT", "engines": { @@ -9630,16 +13908,16 @@ } }, "node_modules/react-dom": { - "version": "19.1.0", - "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.1.0.tgz", - "integrity": "sha512-Xs1hdnE+DyKgeHJeJznQmYMIBG3TKIHJJT95Q58nHLSrElKlGQqDTR2HQ9fx5CN/Gk6Vh/kupBTDLU11/nDk/g==", + "version": "19.2.7", + "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.2.7.tgz", + "integrity": "sha512-t0BRVXvbiE/o20Hfw669rLbMCDWtYZLvmJigy2f0MxsXF+71pxhR3xOkspmsO8h3ZlNzyibAmtCa3l4lYKk6gQ==", "dev": true, "license": "MIT", "dependencies": { - "scheduler": "^0.26.0" + "scheduler": "^0.27.0" }, "peerDependencies": { - "react": "^19.1.0" + "react": "^19.2.7" } }, "node_modules/react-is": { @@ -9651,13 +13929,13 @@ "peer": true }, "node_modules/readdirp": { - "version": "4.1.2", - "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-4.1.2.tgz", - "integrity": "sha512-GDhwkLfywWL2s6vEjyhri+eXmfH6j1L7JE27WhqLeYzoh/A3DBaYGEj2H/HFZCn/kMfim73FXxEJTw06WtxQwg==", + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-5.0.0.tgz", + "integrity": "sha512-9u/XQ1pvrQtYyMpZe7DXKv2p5CNvyVwzUB6uhLAnQwHMSgKMBR62lc7AHljaeteeHXn11XTAaLLUVZYVZyuRBQ==", "dev": true, "license": "MIT", "engines": { - "node": ">= 14.18.0" + "node": ">= 20.19.0" }, "funding": { "type": "individual", @@ -9681,18 +13959,120 @@ "node": ">= 4" } }, - "node_modules/redent": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/redent/-/redent-3.0.0.tgz", - "integrity": "sha512-6tDA8g98We0zd0GvVeMT9arEOnTw9qM03L9cJXaCjrip1OO764RDBLBfrB4cwzNGDj5OA5ioymC9GkizgWJDUg==", + "node_modules/redent": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/redent/-/redent-3.0.0.tgz", + "integrity": "sha512-6tDA8g98We0zd0GvVeMT9arEOnTw9qM03L9cJXaCjrip1OO764RDBLBfrB4cwzNGDj5OA5ioymC9GkizgWJDUg==", + "dev": true, + "license": "MIT", + "dependencies": { + "indent-string": "^4.0.0", + "strip-indent": "^3.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/reflect.getprototypeof": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/reflect.getprototypeof/-/reflect.getprototypeof-1.0.10.tgz", + "integrity": "sha512-00o4I+DVrefhv+nX0ulyi3biSHCPDe+yLv5o/p6d/UVlirijB8E16FtfwSAi4g3tcqrQ4lRAqQSoFEZJehYEcw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.9", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.0.0", + "get-intrinsic": "^1.2.7", + "get-proto": "^1.0.1", + "which-builtin-type": "^1.2.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/regenerate": { + "version": "1.4.2", + "resolved": "https://registry.npmjs.org/regenerate/-/regenerate-1.4.2.tgz", + "integrity": "sha512-zrceR/XhGYU/d/opr2EKO7aRHUeiBI8qjtfHqADTwZd6Szfy16la6kqD0MIUs5z5hx6AaKa+PixpPrR289+I0A==", + "dev": true, + "license": "MIT" + }, + "node_modules/regenerate-unicode-properties": { + "version": "10.2.2", + "resolved": "https://registry.npmjs.org/regenerate-unicode-properties/-/regenerate-unicode-properties-10.2.2.tgz", + "integrity": "sha512-m03P+zhBeQd1RGnYxrGyDAPpWX/epKirLrp8e3qevZdVkKtnCrjjWczIbYc8+xd6vcTStVlqfycTx1KR4LOr0g==", + "dev": true, + "license": "MIT", + "dependencies": { + "regenerate": "^1.4.2" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/regexp.prototype.flags": { + "version": "1.5.4", + "resolved": "https://registry.npmjs.org/regexp.prototype.flags/-/regexp.prototype.flags-1.5.4.tgz", + "integrity": "sha512-dYqgNSZbDwkaJ2ceRd9ojCGjBq+mOm9LmtXnAnEGyHhN/5R7iDW2TRw3h+o/jCFxus3P2LfWIIiwowAjANm7IA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "define-properties": "^1.2.1", + "es-errors": "^1.3.0", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "set-function-name": "^2.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/regexpu-core": { + "version": "6.4.0", + "resolved": "https://registry.npmjs.org/regexpu-core/-/regexpu-core-6.4.0.tgz", + "integrity": "sha512-0ghuzq67LI9bLXpOX/ISfve/Mq33a4aFRzoQYhnnok1JOFpmE/A2TBGkNVenOGEeSBCjIiWcc6MVOG5HEQv0sA==", + "dev": true, + "license": "MIT", + "dependencies": { + "regenerate": "^1.4.2", + "regenerate-unicode-properties": "^10.2.2", + "regjsgen": "^0.8.0", + "regjsparser": "^0.13.0", + "unicode-match-property-ecmascript": "^2.0.0", + "unicode-match-property-value-ecmascript": "^2.2.1" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/regjsgen": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/regjsgen/-/regjsgen-0.8.0.tgz", + "integrity": "sha512-RvwtGe3d7LvWiDQXeQw8p5asZUmfU1G/l6WbUXeHta7Y2PEIvBTwH6E2EfmYUK8pxcxEdEmaomqyp0vZZ7C+3Q==", + "dev": true, + "license": "MIT" + }, + "node_modules/regjsparser": { + "version": "0.13.1", + "resolved": "https://registry.npmjs.org/regjsparser/-/regjsparser-0.13.1.tgz", + "integrity": "sha512-dLsljMd9sqwRkby8zhO1gSg3PnJIBFid8f4CQj/sXx+7cKx+E7u0PKhZ+U4wmhx7EfmtvnA318oVaIkAB1lRJw==", "dev": true, - "license": "MIT", + "license": "BSD-2-Clause", "dependencies": { - "indent-string": "^4.0.0", - "strip-indent": "^3.0.0" + "jsesc": "~3.1.0" }, - "engines": { - "node": ">=8" + "bin": { + "regjsparser": "bin/parser" } }, "node_modules/rehype-highlight": { @@ -9904,6 +14284,28 @@ "dev": true, "license": "MIT" }, + "node_modules/resolve": { + "version": "1.22.12", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.12.tgz", + "integrity": "sha512-TyeJ1zif53BPfHootBGwPRYT1RUt6oGWsaQr8UyZW/eAm9bKoijtvruSDEmZHm92CwS9nj7/fWttqPCgzep8CA==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "is-core-module": "^2.16.1", + "path-parse": "^1.0.7", + "supports-preserve-symlinks-flag": "^1.0.0" + }, + "bin": { + "resolve": "bin/resolve" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/resolve-from": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", @@ -9922,13 +14324,13 @@ "license": "Unlicense" }, "node_modules/rollup": { - "version": "4.60.1", - "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.60.1.tgz", - "integrity": "sha512-VmtB2rFU/GroZ4oL8+ZqXgSA38O6GR8KSIvWmEFv63pQ0G6KaBH9s07PO8XTXP4vI+3UJUEypOfjkGfmSBBR0w==", + "version": "4.61.1", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.61.1.tgz", + "integrity": "sha512-I4KW6iuRpuu2uHBLraZ1wNZe0DP7lnRha+VJ9tNaYVaVgKhW0aI3h4RYnoRPeql0flHm/Co55b7snEDcOfOJrA==", "dev": true, "license": "MIT", "dependencies": { - "@types/estree": "1.0.8" + "@types/estree": "1.0.9" }, "bin": { "rollup": "dist/bin/rollup" @@ -9938,31 +14340,31 @@ "npm": ">=8.0.0" }, "optionalDependencies": { - "@rollup/rollup-android-arm-eabi": "4.60.1", - "@rollup/rollup-android-arm64": "4.60.1", - "@rollup/rollup-darwin-arm64": "4.60.1", - "@rollup/rollup-darwin-x64": "4.60.1", - "@rollup/rollup-freebsd-arm64": "4.60.1", - "@rollup/rollup-freebsd-x64": "4.60.1", - "@rollup/rollup-linux-arm-gnueabihf": "4.60.1", - "@rollup/rollup-linux-arm-musleabihf": "4.60.1", - "@rollup/rollup-linux-arm64-gnu": "4.60.1", - "@rollup/rollup-linux-arm64-musl": "4.60.1", - "@rollup/rollup-linux-loong64-gnu": "4.60.1", - "@rollup/rollup-linux-loong64-musl": "4.60.1", - "@rollup/rollup-linux-ppc64-gnu": "4.60.1", - "@rollup/rollup-linux-ppc64-musl": "4.60.1", - "@rollup/rollup-linux-riscv64-gnu": "4.60.1", - "@rollup/rollup-linux-riscv64-musl": "4.60.1", - "@rollup/rollup-linux-s390x-gnu": "4.60.1", - "@rollup/rollup-linux-x64-gnu": "4.60.1", - "@rollup/rollup-linux-x64-musl": "4.60.1", - "@rollup/rollup-openbsd-x64": "4.60.1", - "@rollup/rollup-openharmony-arm64": "4.60.1", - "@rollup/rollup-win32-arm64-msvc": "4.60.1", - "@rollup/rollup-win32-ia32-msvc": "4.60.1", - "@rollup/rollup-win32-x64-gnu": "4.60.1", - "@rollup/rollup-win32-x64-msvc": "4.60.1", + "@rollup/rollup-android-arm-eabi": "4.61.1", + "@rollup/rollup-android-arm64": "4.61.1", + "@rollup/rollup-darwin-arm64": "4.61.1", + "@rollup/rollup-darwin-x64": "4.61.1", + "@rollup/rollup-freebsd-arm64": "4.61.1", + "@rollup/rollup-freebsd-x64": "4.61.1", + "@rollup/rollup-linux-arm-gnueabihf": "4.61.1", + "@rollup/rollup-linux-arm-musleabihf": "4.61.1", + "@rollup/rollup-linux-arm64-gnu": "4.61.1", + "@rollup/rollup-linux-arm64-musl": "4.61.1", + "@rollup/rollup-linux-loong64-gnu": "4.61.1", + "@rollup/rollup-linux-loong64-musl": "4.61.1", + "@rollup/rollup-linux-ppc64-gnu": "4.61.1", + "@rollup/rollup-linux-ppc64-musl": "4.61.1", + "@rollup/rollup-linux-riscv64-gnu": "4.61.1", + "@rollup/rollup-linux-riscv64-musl": "4.61.1", + "@rollup/rollup-linux-s390x-gnu": "4.61.1", + "@rollup/rollup-linux-x64-gnu": "4.61.1", + "@rollup/rollup-linux-x64-musl": "4.61.1", + "@rollup/rollup-openbsd-x64": "4.61.1", + "@rollup/rollup-openharmony-arm64": "4.61.1", + "@rollup/rollup-win32-arm64-msvc": "4.61.1", + "@rollup/rollup-win32-ia32-msvc": "4.61.1", + "@rollup/rollup-win32-x64-gnu": "4.61.1", + "@rollup/rollup-win32-x64-msvc": "4.61.1", "fsevents": "~2.3.2" } }, @@ -10010,19 +14412,28 @@ } }, "node_modules/runed": { - "version": "0.25.0", - "resolved": "https://registry.npmjs.org/runed/-/runed-0.25.0.tgz", - "integrity": "sha512-7+ma4AG9FT2sWQEA0Egf6mb7PBT2vHyuHail1ie8ropfSjvZGtEAx8YTmUjv/APCsdRRxEVvArNjALk9zFSOrg==", + "version": "0.35.1", + "resolved": "https://registry.npmjs.org/runed/-/runed-0.35.1.tgz", + "integrity": "sha512-2F4Q/FZzbeJTFdIS/PuOoPRSm92sA2LhzTnv6FXhCoENb3huf5+fDuNOg1LNvGOouy3u/225qxmuJvcV3IZK5Q==", "dev": true, "funding": [ "https://github.com/sponsors/huntabyte", "https://github.com/sponsors/tglide" ], + "license": "MIT", "dependencies": { - "esm-env": "^1.0.0" + "dequal": "^2.0.3", + "esm-env": "^1.0.0", + "lz-string": "^1.5.0" }, "peerDependencies": { + "@sveltejs/kit": "^2.21.0", "svelte": "^5.7.0" + }, + "peerDependenciesMeta": { + "@sveltejs/kit": { + "optional": true + } } }, "node_modules/rw": { @@ -10045,6 +14456,26 @@ "node": ">=6" } }, + "node_modules/safe-array-concat": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/safe-array-concat/-/safe-array-concat-1.1.4.tgz", + "integrity": "sha512-wtZlHyOje6OZTGqAoaDKxFkgRtkF9CnHAVnCHKfuj200wAgL+bSJhdsCD2l0Qx/2ekEXjPWcyKkfGb5CPboslg==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.9", + "call-bound": "^1.0.4", + "get-intrinsic": "^1.3.0", + "has-symbols": "^1.1.0", + "isarray": "^2.0.5" + }, + "engines": { + "node": ">=0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/safe-buffer": { "version": "5.1.2", "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", @@ -10052,6 +14483,41 @@ "dev": true, "license": "MIT" }, + "node_modules/safe-push-apply": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/safe-push-apply/-/safe-push-apply-1.0.0.tgz", + "integrity": "sha512-iKE9w/Z7xCzUMIZqdBsp6pEQvwuEebH4vdpjcDWnyzaI6yl6O9FHvVpmGelvEHNsoY6wGblkxR6Zty/h00WiSA==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "isarray": "^2.0.5" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/safe-regex-test": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/safe-regex-test/-/safe-regex-test-1.1.0.tgz", + "integrity": "sha512-x/+Cz4YrimQxQccJf5mKEbIa1NzeCRNI5Ecl/ekmlYaampdNLPalVyIcCZNNH3MvmqBugV5TMYZXv0ljslUlaw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "es-errors": "^1.3.0", + "is-regex": "^1.2.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/safer-buffer": { "version": "2.1.2", "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", @@ -10060,30 +14526,30 @@ "license": "MIT" }, "node_modules/sass": { - "version": "1.93.3", - "resolved": "https://registry.npmjs.org/sass/-/sass-1.93.3.tgz", - "integrity": "sha512-elOcIZRTM76dvxNAjqYrucTSI0teAF/L2Lv0s6f6b7FOwcwIuA357bIE871580AjHJuSvLIRUosgV+lIWx6Rgg==", + "version": "1.100.0", + "resolved": "https://registry.npmjs.org/sass/-/sass-1.100.0.tgz", + "integrity": "sha512-B5j0rYMlinhhOo9tjQebMVVn0TfyXAF+wB3b2ggZUuJ/is/Y+7+JGjirAMxHZ9Z3hIP98NPfamlAkBHa1lAaXQ==", "dev": true, "license": "MIT", "dependencies": { - "chokidar": "^4.0.0", - "immutable": "^5.0.2", + "chokidar": "^5.0.0", + "immutable": "^5.1.5", "source-map-js": ">=0.6.2 <2.0.0" }, "bin": { "sass": "sass.js" }, "engines": { - "node": ">=14.0.0" + "node": ">=20.19.0" }, "optionalDependencies": { "@parcel/watcher": "^2.4.1" } }, "node_modules/scheduler": { - "version": "0.26.0", - "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.26.0.tgz", - "integrity": "sha512-NlHwttCI/l5gCPR3D1nNXtWABUmBwvZpEQiD4IXSbIDq8BzLIK/7Ir5gTFSGZDUu37K5cMNp0hFtzO38sC7gWA==", + "version": "0.27.0", + "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.27.0.tgz", + "integrity": "sha512-eNv+WrVbKu1f3vbYJT/xtiF5syA5HPIMtf9IgY/nKg0sWqzAUEvqY/xm7OcZc/qafLx/iO9FgOmeSAp4v5ti/Q==", "dev": true, "license": "MIT" }, @@ -10102,9 +14568,9 @@ "license": "MIT" }, "node_modules/semver": { - "version": "7.7.3", - "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.3.tgz", - "integrity": "sha512-SdsKMrI9TdgjdweUSR9MweHA4EJ8YxHn8DFaDisvhVlUOe4BF1tLD7GAj0lIqWVl+dPb/rExr0Btby5loQm20Q==", + "version": "7.8.1", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.8.1.tgz", + "integrity": "sha512-rkVq3IXh+4FDGch+KwzX3aV9W3kO54GyEgpvBzSyctDA6Xtd7RJQV1xmXbeQp5v7+VzLOfVqiutSE6GICgPFvg==", "dev": true, "license": "ISC", "bin": { @@ -10141,6 +14607,16 @@ "url": "https://opencollective.com/express" } }, + "node_modules/serialize-javascript": { + "version": "7.0.5", + "resolved": "https://registry.npmjs.org/serialize-javascript/-/serialize-javascript-7.0.5.tgz", + "integrity": "sha512-F4LcB0UqUl1zErq+1nYEEzSHJnIwb3AF2XWB94b+afhrekOUijwooAYqFyRbjYkm2PAKBabx6oYv/xDxNi8IBw==", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=20.0.0" + } + }, "node_modules/serve-static": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/serve-static/-/serve-static-2.2.1.tgz", @@ -10162,12 +14638,61 @@ } }, "node_modules/set-cookie-parser": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/set-cookie-parser/-/set-cookie-parser-3.0.1.tgz", - "integrity": "sha512-n7Z7dXZhJbwuAHhNzkTti6Aw9QDDjZtm3JTpTGATIdNzdQz5GuFs22w90BcvF4INfnrL5xrX3oGsuqO5Dx3A1Q==", + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/set-cookie-parser/-/set-cookie-parser-3.1.0.tgz", + "integrity": "sha512-kjnC1DXBHcxaOaOXBHBeRtltsDG2nUiUni+jP92M9gYdW12rsmx92UsfpH7o5tDRs7I1ZZPSQJQGv3UaRfCiuw==", "dev": true, "license": "MIT" }, + "node_modules/set-function-length": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz", + "integrity": "sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==", + "dev": true, + "license": "MIT", + "dependencies": { + "define-data-property": "^1.1.4", + "es-errors": "^1.3.0", + "function-bind": "^1.1.2", + "get-intrinsic": "^1.2.4", + "gopd": "^1.0.1", + "has-property-descriptors": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/set-function-name": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/set-function-name/-/set-function-name-2.0.2.tgz", + "integrity": "sha512-7PGFlmtwsEADb0WYyvCMa1t+yke6daIG4Wirafur5kcf+MhUnPms1UeR0CKQdTZD81yESwMHbtn+TR+dMviakQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "define-data-property": "^1.1.4", + "es-errors": "^1.3.0", + "functions-have-names": "^1.2.3", + "has-property-descriptors": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/set-proto": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/set-proto/-/set-proto-1.0.0.tgz", + "integrity": "sha512-RJRdvCo6IAnPdsvP/7m6bsQqNnn1FCBX5ZNtFL98MmFF/4xAIJTIg1YbHW5DC2W5SKZanrC6i4HsJqlajw/dZw==", + "dev": true, + "license": "MIT", + "dependencies": { + "dunder-proto": "^1.0.1", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/setprototypeof": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/setprototypeof/-/setprototypeof-1.2.0.tgz", @@ -10175,6 +14700,58 @@ "dev": true, "license": "ISC" }, + "node_modules/sharp": { + "version": "0.33.5", + "resolved": "https://registry.npmjs.org/sharp/-/sharp-0.33.5.tgz", + "integrity": "sha512-haPVm1EkS9pgvHrQ/F3Xy+hgcuMV0Wm9vfIBSiwZ05k+xgb0PkBQpGsAA/oWdDobNaZTH5ppvHtzCFbnSEwHVw==", + "dev": true, + "hasInstallScript": true, + "license": "Apache-2.0", + "dependencies": { + "color": "^4.2.3", + "detect-libc": "^2.0.3", + "semver": "^7.6.3" + }, + "engines": { + "node": "^18.17.0 || ^20.3.0 || >=21.0.0" + }, + "funding": { + "url": "https://opencollective.com/libvips" + }, + "optionalDependencies": { + "@img/sharp-darwin-arm64": "0.33.5", + "@img/sharp-darwin-x64": "0.33.5", + "@img/sharp-libvips-darwin-arm64": "1.0.4", + "@img/sharp-libvips-darwin-x64": "1.0.4", + "@img/sharp-libvips-linux-arm": "1.0.5", + "@img/sharp-libvips-linux-arm64": "1.0.4", + "@img/sharp-libvips-linux-s390x": "1.0.4", + "@img/sharp-libvips-linux-x64": "1.0.4", + "@img/sharp-libvips-linuxmusl-arm64": "1.0.4", + "@img/sharp-libvips-linuxmusl-x64": "1.0.4", + "@img/sharp-linux-arm": "0.33.5", + "@img/sharp-linux-arm64": "0.33.5", + "@img/sharp-linux-s390x": "0.33.5", + "@img/sharp-linux-x64": "0.33.5", + "@img/sharp-linuxmusl-arm64": "0.33.5", + "@img/sharp-linuxmusl-x64": "0.33.5", + "@img/sharp-wasm32": "0.33.5", + "@img/sharp-win32-ia32": "0.33.5", + "@img/sharp-win32-x64": "0.33.5" + } + }, + "node_modules/sharp-ico": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/sharp-ico/-/sharp-ico-0.1.5.tgz", + "integrity": "sha512-a3jODQl82NPp1d5OYb0wY+oFaPk7AvyxipIowCHk7pBsZCWgbe0yAkU2OOXdoH0ENyANhyOQbs9xkAiRHcF02Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "decode-ico": "*", + "ico-endec": "*", + "sharp": "*" + } + }, "node_modules/shebang-command": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", @@ -10219,14 +14796,14 @@ } }, "node_modules/side-channel-list": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/side-channel-list/-/side-channel-list-1.0.0.tgz", - "integrity": "sha512-FCLHtRD/gnpCiCHEiJLOwdmFP+wzCmDEkc9y7NsYxeF4u7Btsn1ZuwgwJGxImImHicJArLP4R0yX4c2KCrMrTA==", + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/side-channel-list/-/side-channel-list-1.0.1.tgz", + "integrity": "sha512-mjn/0bi/oUURjc5Xl7IaWi/OJJJumuoJFQJfDDyO46+hBWsfaVM65TBHq2eoZBhzl9EchxOijpkbRC8SVBQU0w==", "dev": true, "license": "MIT", "dependencies": { "es-errors": "^1.3.0", - "object-inspect": "^1.13.3" + "object-inspect": "^1.13.4" }, "engines": { "node": ">= 0.4" @@ -10281,6 +14858,29 @@ "dev": true, "license": "ISC" }, + "node_modules/signal-exit": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", + "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==", + "dev": true, + "license": "ISC", + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/simple-swizzle": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.4.tgz", + "integrity": "sha512-nAu1WFPQSMNr2Zn9PGSZK9AGn4t/y97lEm+MXTtUDwfP0ksAIX4nO+6ruD9Jwut4C49SB1Ws+fbXsm/yScWOHw==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-arrayish": "^0.3.1" + } + }, "node_modules/sirv": { "version": "3.0.2", "resolved": "https://registry.npmjs.org/sirv/-/sirv-3.0.2.tgz", @@ -10296,6 +14896,16 @@ "node": ">=18" } }, + "node_modules/smob": { + "version": "1.6.2", + "resolved": "https://registry.npmjs.org/smob/-/smob-1.6.2.tgz", + "integrity": "sha512-RQsvleCbF8cVHEv+xuDGaA4pOizFqJ0GgjtMSRo6oP8pnN7WsigHgVGey6aILRBKv4W2YOMHLqbKdnB6hpB9fw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=20.0.0" + } + }, "node_modules/source-map": { "version": "0.6.1", "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", @@ -10316,6 +14926,17 @@ "node": ">=0.10.0" } }, + "node_modules/source-map-support": { + "version": "0.5.21", + "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.21.tgz", + "integrity": "sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w==", + "dev": true, + "license": "MIT", + "dependencies": { + "buffer-from": "^1.0.0", + "source-map": "^0.6.0" + } + }, "node_modules/space-separated-tokens": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/space-separated-tokens/-/space-separated-tokens-2.0.2.tgz", @@ -10351,21 +14972,38 @@ "dev": true, "license": "MIT" }, + "node_modules/stop-iteration-iterator": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/stop-iteration-iterator/-/stop-iteration-iterator-1.1.0.tgz", + "integrity": "sha512-eLoXW/DHyl62zxY4SCaIgnRhuMr6ri4juEYARS8E6sCEqzKpOiE521Ucofdx+KnDZl5xmvGYaaKCk5FEOxJCoQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "internal-slot": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/storybook": { - "version": "10.3.3", - "resolved": "https://registry.npmjs.org/storybook/-/storybook-10.3.3.tgz", - "integrity": "sha512-tMoRAts9EVqf+mEMPLC6z1DPyHbcPe+CV1MhLN55IKsl0HxNjvVGK44rVPSePbltPE6vIsn4bdRj6CCUt8SJwQ==", + "version": "10.4.2", + "resolved": "https://registry.npmjs.org/storybook/-/storybook-10.4.2.tgz", + "integrity": "sha512-5Ax5vbHxFgMBGGhQDm75Rrumm/HZC4ICFhMcJaM0UlqnC/4FKj/IaZtImZFupknyiiyUEcWHPQFA2kX3/VSv1A==", "dev": true, "license": "MIT", "dependencies": { "@storybook/global": "^5.0.0", - "@storybook/icons": "^2.0.1", + "@storybook/icons": "^2.0.2", "@testing-library/jest-dom": "^6.9.1", "@testing-library/user-event": "^14.6.1", "@vitest/expect": "3.2.4", "@vitest/spy": "3.2.4", + "@webcontainer/env": "^1.1.1", "esbuild": "^0.18.0 || ^0.19.0 || ^0.20.0 || ^0.21.0 || ^0.22.0 || ^0.23.0 || ^0.24.0 || ^0.25.0 || ^0.26.0 || ^0.27.0", "open": "^10.2.0", + "oxc-parser": "^0.127.0", + "oxc-resolver": "^11.19.1", "recast": "^0.23.5", "semver": "^7.7.3", "use-sync-external-store": "^1.5.0", @@ -10379,14 +15017,122 @@ "url": "https://opencollective.com/storybook" }, "peerDependencies": { - "prettier": "^2 || ^3" + "@types/react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0", + "prettier": "^2 || ^3", + "vite-plus": "^0.1.15" }, "peerDependenciesMeta": { + "@types/react": { + "optional": true + }, "prettier": { "optional": true + }, + "vite-plus": { + "optional": true } } }, + "node_modules/storybook/node_modules/@vitest/spy": { + "version": "3.2.4", + "resolved": "https://registry.npmjs.org/@vitest/spy/-/spy-3.2.4.tgz", + "integrity": "sha512-vAfasCOe6AIK70iP5UD11Ac4siNUNJ9i/9PZ3NKx07sG6sUxeag1LWdNrMWeKKYBLlzuK+Gn65Yd5nyL6ds+nw==", + "dev": true, + "license": "MIT", + "dependencies": { + "tinyspy": "^4.0.3" + }, + "funding": { + "url": "https://opencollective.com/vitest" + } + }, + "node_modules/string.prototype.matchall": { + "version": "4.0.12", + "resolved": "https://registry.npmjs.org/string.prototype.matchall/-/string.prototype.matchall-4.0.12.tgz", + "integrity": "sha512-6CC9uyBL+/48dYizRf7H7VAYCMCNTBeM78x/VTUe9bFEaxBepPJDa1Ow99LqI/1yF7kuy7Q3cQsYMrcjGUcskA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.3", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.6", + "es-errors": "^1.3.0", + "es-object-atoms": "^1.0.0", + "get-intrinsic": "^1.2.6", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "internal-slot": "^1.1.0", + "regexp.prototype.flags": "^1.5.3", + "set-function-name": "^2.0.2", + "side-channel": "^1.1.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/string.prototype.trim": { + "version": "1.2.10", + "resolved": "https://registry.npmjs.org/string.prototype.trim/-/string.prototype.trim-1.2.10.tgz", + "integrity": "sha512-Rs66F0P/1kedk5lyYyH9uBzuiI/kNRmwJAR9quK6VOtIpZ2G+hMZd+HQbbv25MgCA6gEffoMZYxlTod4WcdrKA==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.2", + "define-data-property": "^1.1.4", + "define-properties": "^1.2.1", + "es-abstract": "^1.23.5", + "es-object-atoms": "^1.0.0", + "has-property-descriptors": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/string.prototype.trimend": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/string.prototype.trimend/-/string.prototype.trimend-1.0.9.tgz", + "integrity": "sha512-G7Ok5C6E/j4SGfyLCloXTrngQIQU3PWtXGst3yM7Bea9FRURf1S42ZHlZZtsNque2FN2PoUhfZXYLNWwEr4dLQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.8", + "call-bound": "^1.0.2", + "define-properties": "^1.2.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/string.prototype.trimstart": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/string.prototype.trimstart/-/string.prototype.trimstart-1.0.8.tgz", + "integrity": "sha512-UXSH262CSZY1tfu3G3Secr6uGLCFVPMhIqHjlgCUtCCcgihYc/xKs9djMTMUOb2j1mVSeU8EU6NWc/iQKU6Gfg==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bind": "^1.0.7", + "define-properties": "^1.2.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/stringify-entities": { "version": "4.0.4", "resolved": "https://registry.npmjs.org/stringify-entities/-/stringify-entities-4.0.4.tgz", @@ -10397,19 +15143,34 @@ "character-entities-html4": "^2.0.0", "character-entities-legacy": "^3.0.0" }, - "funding": { - "type": "github", - "url": "https://github.com/sponsors/wooorm" + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, + "node_modules/stringify-object": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/stringify-object/-/stringify-object-3.3.0.tgz", + "integrity": "sha512-rHqiFh1elqCQ9WPLIC8I0Q/g/wj5J1eMkyoiD6eoQApWHP0FtlK7rqnhmabL5VUY9JQCcqwwvlOaSuutekgyrw==", + "dev": true, + "license": "BSD-2-Clause", + "dependencies": { + "get-own-enumerable-property-symbols": "^3.0.0", + "is-obj": "^1.0.1", + "is-regexp": "^1.0.0" + }, + "engines": { + "node": ">=4" } }, "node_modules/strip-ansi": { - "version": "7.1.0", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.0.tgz", - "integrity": "sha512-iq6eVVI64nQQTRYq2KtEg2d2uU7LElhTJwsH4YzIHZshxlgZms/wIc4VoDQTlG/IvVIrBKG06CrZnp0qv7hkcQ==", + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.2.0.tgz", + "integrity": "sha512-yDPMNjp4WyfYBkHnjIRLfca1i6KMyGCtsVgoKe/z1+6vukgaENdgGBZt+ZmKPc4gavvEZ5OgHfHdrazhgNyG7w==", "dev": true, "license": "MIT", "dependencies": { - "ansi-regex": "^6.0.1" + "ansi-regex": "^6.2.2" }, "engines": { "node": ">=12" @@ -10419,9 +15180,9 @@ } }, "node_modules/strip-ansi/node_modules/ansi-regex": { - "version": "6.1.0", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.1.0.tgz", - "integrity": "sha512-7HSX4QQb4CspciLpVFwyRe79O3xsIZDDLER21kERQ71oaPodF8jL725AgJMFAYbooIqolJoRLuM81SpeUkpkvA==", + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.2.2.tgz", + "integrity": "sha512-Bq3SmSpyFHaWjPk8If9yc6svM8c56dB5BAtW4Qbw5jHTwwXXcTLoRMkpDJp6VL0XzlWaCHTXrkFURMYmD0sLqg==", "dev": true, "license": "MIT", "engines": { @@ -10431,6 +15192,16 @@ "url": "https://github.com/chalk/ansi-regex?sponsor=1" } }, + "node_modules/strip-comments": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/strip-comments/-/strip-comments-2.0.1.tgz", + "integrity": "sha512-ZprKx+bBLXv067WTCALv8SSz5l2+XhpYCsVtSqlMnkAXMWDq+/ekVbl1ghqP9rUHTzv6sm/DwCOiYutU/yp1fw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=10" + } + }, "node_modules/strip-indent": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/strip-indent/-/strip-indent-3.0.0.tgz", @@ -10458,13 +15229,13 @@ } }, "node_modules/style-to-object": { - "version": "1.0.9", - "resolved": "https://registry.npmjs.org/style-to-object/-/style-to-object-1.0.9.tgz", - "integrity": "sha512-G4qppLgKu/k6FwRpHiGiKPaPTFcG3g4wNVX/Qsfu+RqQM30E7Tyu/TEgxcL9PNLF5pdRLwQdE3YKKf+KF2Dzlw==", + "version": "1.0.14", + "resolved": "https://registry.npmjs.org/style-to-object/-/style-to-object-1.0.14.tgz", + "integrity": "sha512-LIN7rULI0jBscWQYaSswptyderlarFkjQ+t79nzty8tcIAceVomEVlLzH5VP4Cmsv6MtKhs7qaAiwlcp+Mgaxw==", "dev": true, "license": "MIT", "dependencies": { - "inline-style-parser": "0.2.4" + "inline-style-parser": "0.2.7" } }, "node_modules/stylis": { @@ -10487,16 +15258,29 @@ "node": ">=8" } }, + "node_modules/supports-preserve-symlinks-flag": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz", + "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/svelte": { - "version": "5.55.7", - "resolved": "https://registry.npmjs.org/svelte/-/svelte-5.55.7.tgz", - "integrity": "sha512-ymI5ykLPwIHW839E053FQbI1G+jnRFJEw3Kv5Y4njixVWywQBx+NUFpkkKyk5LIb36Fg9DVXSYpqiGekLD0hyw==", + "version": "5.56.1", + "resolved": "https://registry.npmjs.org/svelte/-/svelte-5.56.1.tgz", + "integrity": "sha512-eArsJmvl3xZVuTYD852PzIEdg2wgDdIZ1NEsIPbzAukHwi284B18No4nK2rCO9AwsWUDza4Cjvmoa4HaojTl5g==", "dev": true, "license": "MIT", "dependencies": { "@jridgewell/remapping": "^2.3.4", "@jridgewell/sourcemap-codec": "^1.5.0", - "@sveltejs/acorn-typescript": "^1.0.5", + "@sveltejs/acorn-typescript": "^1.0.10", "@types/estree": "^1.0.5", "@types/trusted-types": "^2.0.7", "acorn": "^8.12.1", @@ -10505,7 +15289,7 @@ "clsx": "^2.1.1", "devalue": "^5.8.1", "esm-env": "^1.2.1", - "esrap": "^2.2.4", + "esrap": "^2.2.9", "is-reference": "^3.0.3", "locate-character": "^3.0.0", "magic-string": "^0.30.11", @@ -10553,14 +15337,22 @@ "@types/estree": "^1.0.1" } }, + "node_modules/svelte-ast-print/node_modules/zimmerframe": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/zimmerframe/-/zimmerframe-1.1.2.tgz", + "integrity": "sha512-rAbqEGa8ovJy4pyBxZM70hg4pE6gDgaQ0Sl9M3enG3I0d6H4XSAM3GeNGLKnsBpuijUow064sf7ww1nutC5/3w==", + "dev": true, + "license": "MIT" + }, "node_modules/svelte-check": { - "version": "4.3.0", - "resolved": "https://registry.npmjs.org/svelte-check/-/svelte-check-4.3.0.tgz", - "integrity": "sha512-Iz8dFXzBNAM7XlEIsUjUGQhbEE+Pvv9odb9+0+ITTgFWZBGeJRRYqHUUglwe2EkLD5LIsQaAc4IUJyvtKuOO5w==", + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/svelte-check/-/svelte-check-4.6.0.tgz", + "integrity": "sha512-KhVnDFDSid57mmZtHz8gfW8AAGylOZ0vPnOIzVmAL+urzwK8sBYXRss953gD8T0OdgAQ11mdWhE6uadmtOz8TQ==", "dev": true, "license": "MIT", "dependencies": { "@jridgewell/trace-mapping": "^0.3.25", + "@sveltejs/load-config": "0.1.1", "chokidar": "^4.0.1", "fdir": "^6.2.0", "picocolors": "^1.0.0", @@ -10577,10 +15369,40 @@ "typescript": ">=5.0.0" } }, + "node_modules/svelte-check/node_modules/chokidar": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-4.0.3.tgz", + "integrity": "sha512-Qgzu8kfBvo+cA4962jnP1KkS6Dop5NS6g7R5LFYJr4b8Ub94PPQXUksCw9PvXoeXPRRddRNC5C1JQUR2SMGtnA==", + "dev": true, + "license": "MIT", + "dependencies": { + "readdirp": "^4.0.1" + }, + "engines": { + "node": ">= 14.16.0" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + } + }, + "node_modules/svelte-check/node_modules/readdirp": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-4.1.2.tgz", + "integrity": "sha512-GDhwkLfywWL2s6vEjyhri+eXmfH6j1L7JE27WhqLeYzoh/A3DBaYGEj2H/HFZCn/kMfim73FXxEJTw06WtxQwg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 14.18.0" + }, + "funding": { + "type": "individual", + "url": "https://paulmillr.com/funding/" + } + }, "node_modules/svelte-eslint-parser": { - "version": "1.4.1", - "resolved": "https://registry.npmjs.org/svelte-eslint-parser/-/svelte-eslint-parser-1.4.1.tgz", - "integrity": "sha512-1eqkfQ93goAhjAXxZiu1SaKI9+0/sxp4JIWQwUpsz7ybehRE5L8dNuz7Iry7K22R47p5/+s9EM+38nHV2OlgXA==", + "version": "1.8.0", + "resolved": "https://registry.npmjs.org/svelte-eslint-parser/-/svelte-eslint-parser-1.8.0.tgz", + "integrity": "sha512-mikR1qwIVy3t5WthUoAXkMwxkXvabZP9FJgdx35Ei7EbGWmctva1Pih16Koeor/bdNNq8NXHlwKGS6NkYTawLg==", "dev": true, "license": "MIT", "dependencies": { @@ -10589,11 +15411,12 @@ "espree": "^10.0.0", "postcss": "^8.4.49", "postcss-scss": "^4.0.9", - "postcss-selector-parser": "^7.0.0" + "postcss-selector-parser": "^7.0.0", + "semver": "^7.7.2" }, "engines": { "node": "^18.18.0 || ^20.9.0 || >=21.1.0", - "pnpm": "10.24.0" + "pnpm": "10.34.1" }, "funding": { "url": "https://github.com/sponsors/ota-meshi" @@ -10622,9 +15445,9 @@ } }, "node_modules/svelte-sonner": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/svelte-sonner/-/svelte-sonner-1.0.5.tgz", - "integrity": "sha512-9dpGPFqKb/QWudYqGnEz93vuY+NgCEvyNvxoCLMVGw6sDN/3oVeKV1xiEirW2E1N3vJEyj5imSBNOGltQHA7mg==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/svelte-sonner/-/svelte-sonner-1.1.1.tgz", + "integrity": "sha512-5cd3p7wa4cq0NsqslMwdlPb7x1JglEZ/GKrLePWNr5bCxR1nagAVrY01FRFrXfUGs41miLt3C327+8XJo5BzZw==", "dev": true, "license": "MIT", "dependencies": { @@ -10652,16 +15475,16 @@ } }, "node_modules/svelte-toolbelt": { - "version": "0.7.1", - "resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.7.1.tgz", - "integrity": "sha512-HcBOcR17Vx9bjaOceUvxkY3nGmbBmCBBbuWLLEWO6jtmWH8f/QoWmbyUfQZrpDINH39en1b8mptfPQT9VKQ1xQ==", + "version": "0.10.6", + "resolved": "https://registry.npmjs.org/svelte-toolbelt/-/svelte-toolbelt-0.10.6.tgz", + "integrity": "sha512-YWuX+RE+CnWYx09yseAe4ZVMM7e7GRFZM6OYWpBKOb++s+SQ8RBIMMe+Bs/CznBMc0QPLjr+vDBxTAkozXsFXQ==", "dev": true, "funding": [ "https://github.com/sponsors/huntabyte" ], "dependencies": { "clsx": "^2.1.1", - "runed": "^0.23.2", + "runed": "^0.35.1", "style-to-object": "^1.0.8" }, "engines": { @@ -10669,23 +15492,7 @@ "pnpm": ">=8.7.0" }, "peerDependencies": { - "svelte": "^5.0.0" - } - }, - "node_modules/svelte-toolbelt/node_modules/runed": { - "version": "0.23.4", - "resolved": "https://registry.npmjs.org/runed/-/runed-0.23.4.tgz", - "integrity": "sha512-9q8oUiBYeXIDLWNK5DfCWlkL0EW3oGbk845VdKlPeia28l751VpfesaB/+7pI6rnbx1I6rqoZ2fZxptOJLxILA==", - "dev": true, - "funding": [ - "https://github.com/sponsors/huntabyte", - "https://github.com/sponsors/tglide" - ], - "dependencies": { - "esm-env": "^1.0.0" - }, - "peerDependencies": { - "svelte": "^5.7.0" + "svelte": "^5.30.2" } }, "node_modules/svelte/node_modules/aria-query": { @@ -10699,30 +15506,27 @@ } }, "node_modules/svelte/node_modules/esrap": { - "version": "2.2.4", - "resolved": "https://registry.npmjs.org/esrap/-/esrap-2.2.4.tgz", - "integrity": "sha512-suICpxAmZ9A8bzJjEl/+rLJiDKC0X4gYWUxT6URAWBLvlXmtbZd5ySMu/N2ZGEtMCAmflUDPSehrP9BQcsGcSg==", + "version": "2.2.11", + "resolved": "https://registry.npmjs.org/esrap/-/esrap-2.2.11.tgz", + "integrity": "sha512-gPdx+I+BjYEinNMQaBXFjbaJVyoPMU4ZODg5mE+M4DqVG9VusAVHHjcBX+zqyITlI0DIARwDMMzZwAWj36dRoQ==", "dev": true, "license": "MIT", "dependencies": { - "@jridgewell/sourcemap-codec": "^1.4.15", + "@jridgewell/sourcemap-codec": "^1.4.15" + }, + "peerDependencies": { "@typescript-eslint/types": "^8.2.0" - } - }, - "node_modules/svelte/node_modules/is-reference": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/is-reference/-/is-reference-3.0.3.tgz", - "integrity": "sha512-ixkJoqQvAP88E6wLydLGGqCJsrFUnqoH6HnaczB8XmDH1oaWU+xxdptvikTgaEhtZ53Ky6YXiBuUI2WXLMCwjw==", - "dev": true, - "license": "MIT", - "dependencies": { - "@types/estree": "^1.0.6" + }, + "peerDependenciesMeta": { + "@typescript-eslint/types": { + "optional": true + } } }, "node_modules/svelte2tsx": { - "version": "0.7.47", - "resolved": "https://registry.npmjs.org/svelte2tsx/-/svelte2tsx-0.7.47.tgz", - "integrity": "sha512-1aw/MFKVPM96OBevJdC12do2an9t5Zwr3Va9amLgTLpJje36ibD1iIHpuqCYWUrdR9vw6g6btKGQPmsqE8ZYCw==", + "version": "0.7.56", + "resolved": "https://registry.npmjs.org/svelte2tsx/-/svelte2tsx-0.7.56.tgz", + "integrity": "sha512-NTvqqL+goYlW8gWNajk81L07+uu7jw5V2m1Az5MZbYm3GEydcHXh+uTrLHM9SuGuaqCtF90vlMXkOVBotfH94g==", "dev": true, "license": "MIT", "dependencies": { @@ -10731,20 +15535,20 @@ }, "peerDependencies": { "svelte": "^3.55 || ^4.0.0-next.0 || ^4.0 || ^5.0.0-next.0", - "typescript": "^4.9.4 || ^5.0.0" + "typescript": "^4.9.4 || ^5.0.0 || ^6.0.0" } }, "node_modules/tabbable": { - "version": "6.2.0", - "resolved": "https://registry.npmjs.org/tabbable/-/tabbable-6.2.0.tgz", - "integrity": "sha512-Cat63mxsVJlzYvN51JmVXIgNoUokrIaT2zLclCXjRd8boZ0004U4KCs/sToJ75C6sdlByWxpYnb5Boif1VSFew==", + "version": "6.4.0", + "resolved": "https://registry.npmjs.org/tabbable/-/tabbable-6.4.0.tgz", + "integrity": "sha512-05PUHKSNE8ou2dwIxTngl4EzcnsCDZGJ/iCLtDflR/SHB/ny14rXc+qU5P4mG9JkusiV7EivzY9Mhm55AzAvCg==", "dev": true, "license": "MIT" }, "node_modules/tailwind-merge": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/tailwind-merge/-/tailwind-merge-3.3.1.tgz", - "integrity": "sha512-gBXpgUm/3rp1lMZZrM/w7D8GKqshif0zAymAhbCyIt8KMe+0v9DQ7cdYLR4FHH/cKpdTXb+A/tKKU3eolfsI+g==", + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/tailwind-merge/-/tailwind-merge-3.6.0.tgz", + "integrity": "sha512-uxL7qAVQriqRQPAyK3pj66VqskWqoZ37PW94jwOTwNfq/z9oyu1V+eqrZqtR2+fCiXdYOZe/Modt8GtvqNzu+w==", "dev": true, "license": "MIT", "funding": { @@ -10773,26 +15577,30 @@ } }, "node_modules/tailwindcss": { - "version": "4.1.11", - "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.1.11.tgz", - "integrity": "sha512-2E9TBm6MDD/xKYe+dvJZAmg3yxIEDNRc0jwlNyDg/4Fil2QcSLjFKGVff0lAf1jjeaArlG/M75Ey/EYr/OJtBA==", + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-4.3.0.tgz", + "integrity": "sha512-y6nxMGB1nMW9R6k96e5gdIFzcfL/gTJRNaqGes1YvkLnPVXzWgbqFF2yLC0T8G774n24cx3Pe8XrKoniCOAH+Q==", "dev": true, "license": "MIT" }, "node_modules/tapable": { - "version": "2.2.2", - "resolved": "https://registry.npmjs.org/tapable/-/tapable-2.2.2.tgz", - "integrity": "sha512-Re10+NauLTMCudc7T5WLFLAwDhQ0JWdrMK+9B2M8zR5hRExKmsRDCBA7/aV/pNJFltmBFO5BAMlQFi/vq3nKOg==", + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/tapable/-/tapable-2.3.3.tgz", + "integrity": "sha512-uxc/zpqFg6x7C8vOE7lh6Lbda8eEL9zmVm/PLeTPBRhh1xCgdWaQ+J1CUieGpIfm2HdtsUpRv+HshiasBMcc6A==", "dev": true, "license": "MIT", "engines": { "node": ">=6" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/webpack" } }, "node_modules/tar": { - "version": "7.5.13", - "resolved": "https://registry.npmjs.org/tar/-/tar-7.5.13.tgz", - "integrity": "sha512-tOG/7GyXpFevhXVh8jOPJrmtRpOTsYqUIkVdVooZYJS/z8WhfQUX8RJILmeuJNinGAMSu1veBr4asSHFt5/hng==", + "version": "7.5.16", + "resolved": "https://registry.npmjs.org/tar/-/tar-7.5.16.tgz", + "integrity": "sha512-56adEpPMouktRlBLXiaYFFzZ/3+JXa8P9n7WbR+ibIjtviN55mEaOkiysCnPnWm+7kkui1Dn8J9l+g6zV8731w==", "dev": true, "license": "BlueOak-1.0.0", "dependencies": { @@ -10806,6 +15614,84 @@ "node": ">=18" } }, + "node_modules/tar/node_modules/yallist": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz", + "integrity": "sha512-YgvUTfwqyc7UXVMrB+SImsVYSmTS8X/tSrtdNZMImM+n7+QTriRXyXim0mBrTXNeqzVF0KWGgHPeiyViFFrNDw==", + "dev": true, + "license": "BlueOak-1.0.0", + "engines": { + "node": ">=18" + } + }, + "node_modules/temp-dir": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/temp-dir/-/temp-dir-2.0.0.tgz", + "integrity": "sha512-aoBAniQmmwtcKp/7BzsH8Cxzv8OL736p7v1ihGb5e9DJ9kTwGWHrQrVB5+lfVDzfGrdRzXch+ig7LHaY1JTOrg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/tempy": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/tempy/-/tempy-0.6.0.tgz", + "integrity": "sha512-G13vtMYPT/J8A4X2SjdtBTphZlrp1gKv6hZiOjw14RCWg6GbHuQBGtjlx75xLbYV/wEc0D7G5K4rxKP/cXk8Bw==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-stream": "^2.0.0", + "temp-dir": "^2.0.0", + "type-fest": "^0.16.0", + "unique-string": "^2.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/tempy/node_modules/type-fest": { + "version": "0.16.0", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.16.0.tgz", + "integrity": "sha512-eaBzG6MxNzEn9kiwvtre90cXaNLkmadMWa1zQMs3XORCXNbsH/OewwbxC5ia9dCxIxnTAsSxXJaa/p5y8DlvJg==", + "dev": true, + "license": "(MIT OR CC0-1.0)", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/terser": { + "version": "5.48.0", + "resolved": "https://registry.npmjs.org/terser/-/terser-5.48.0.tgz", + "integrity": "sha512-J/9An6vs9Us6wKRriSFXBWdRZapREHqFzdNUKk0pmu804EMR6dr6winwo7e5JDxN4xahxQsuysyYFwlwj4XN/Q==", + "dev": true, + "license": "BSD-2-Clause", + "dependencies": { + "@jridgewell/source-map": "^0.3.3", + "acorn": "^8.15.0", + "commander": "^2.20.0", + "source-map-support": "~0.5.20" + }, + "bin": { + "terser": "bin/terser" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/terser/node_modules/commander": { + "version": "2.20.3", + "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.3.tgz", + "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==", + "dev": true, + "license": "MIT" + }, "node_modules/tiny-invariant": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/tiny-invariant/-/tiny-invariant-1.3.3.tgz", @@ -10831,14 +15717,14 @@ } }, "node_modules/tinyglobby": { - "version": "0.2.15", - "resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.15.tgz", - "integrity": "sha512-j2Zq4NyQYG5XMST4cbs02Ak8iJUdxRM0XI5QyxXuZOzKOINmWurp3smXu3y5wDcJrptwpSjgXHzIQxR0omXljQ==", + "version": "0.2.17", + "resolved": "https://registry.npmjs.org/tinyglobby/-/tinyglobby-0.2.17.tgz", + "integrity": "sha512-wXR/dYpcqKmfWpEdZjiKJOwCNFndD0DMnrW/cYjVGttEkBfVgcLFHoNrlj47mjOVic9yyNu65alsgF4NQyTa2g==", "dev": true, "license": "MIT", "dependencies": { "fdir": "^6.5.0", - "picomatch": "^4.0.3" + "picomatch": "^4.0.4" }, "engines": { "node": ">=12.0.0" @@ -10858,28 +15744,21 @@ } }, "node_modules/tinyspy": { - "version": "4.0.3", - "resolved": "https://registry.npmjs.org/tinyspy/-/tinyspy-4.0.3.tgz", - "integrity": "sha512-t2T/WLB2WRgZ9EpE4jgPJ9w+i66UZfDc8wHh0xrwiRNN+UwH98GIJkTeZqX9rg0i0ptwzqW+uYeIF0T4F8LR7A==", + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/tinyspy/-/tinyspy-4.0.4.tgz", + "integrity": "sha512-azl+t0z7pw/z958Gy9svOTuzqIk6xq+NSheJzn5MMWtWTFywIacg2wUlzKFGtt3cthx0r2SxMK0yzJOR0IES7Q==", "dev": true, "license": "MIT", "engines": { "node": ">=14.0.0" } }, - "node_modules/to-regex-range": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", - "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", + "node_modules/to-data-view": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/to-data-view/-/to-data-view-1.1.0.tgz", + "integrity": "sha512-1eAdufMg6mwgmlojAx3QeMnzB/BTVp7Tbndi3U7ftcT2zCZadjxkkmLmd97zmaxWi+sgGcgWrokmpEoy0Dn0vQ==", "dev": true, - "license": "MIT", - "optional": true, - "dependencies": { - "is-number": "^7.0.0" - }, - "engines": { - "node": ">=8.0" - } + "license": "MIT" }, "node_modules/toidentifier": { "version": "1.0.1", @@ -10901,6 +15780,16 @@ "node": ">=6" } }, + "node_modules/tr46": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-1.0.1.tgz", + "integrity": "sha512-dTpowEjclQ7Kgx5SdBkqRzVhERQXov8/l9Ft9dVM9fmg0W0KQSVaXX9T4i6twCPNtYiZM53lpSSUAwJbFPOHxA==", + "dev": true, + "license": "MIT", + "dependencies": { + "punycode": "^2.1.0" + } + }, "node_modules/trim-lines": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/trim-lines/-/trim-lines-3.0.1.tgz", @@ -10924,9 +15813,9 @@ } }, "node_modules/ts-api-utils": { - "version": "2.4.0", - "resolved": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.4.0.tgz", - "integrity": "sha512-3TaVTaAv2gTiMB35i3FiGJaRfwb3Pyn/j3m/bfAvGe8FB7CF6u+LMYqYlDh7reQf7UNvoTvdfAqHGmPGOSsPmA==", + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/ts-api-utils/-/ts-api-utils-2.5.0.tgz", + "integrity": "sha512-OJ/ibxhPlqrMM0UiNHJ/0CKQkoKF243/AEmplt3qpRgkW8VG7IfOS41h7V8TjITqdByHzrjcS/2si+y4lIh8NA==", "dev": true, "license": "MIT", "engines": { @@ -10946,68 +15835,164 @@ "node": ">=6.10" } }, - "node_modules/tslib": { - "version": "2.8.1", - "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", - "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", - "dev": true, - "license": "0BSD" - }, - "node_modules/tw-animate-css": { - "version": "1.3.5", - "resolved": "https://registry.npmjs.org/tw-animate-css/-/tw-animate-css-1.3.5.tgz", - "integrity": "sha512-t3u+0YNoloIhj1mMXs779P6MO9q3p3mvGn4k1n3nJPqJw/glZcuijG2qTSN4z4mgNRfW5ZC3aXJFLwDtiipZXA==", + "node_modules/tslib": { + "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", + "dev": true, + "license": "0BSD" + }, + "node_modules/tw-animate-css": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/tw-animate-css/-/tw-animate-css-1.4.0.tgz", + "integrity": "sha512-7bziOlRqH0hJx80h/3mbicLW7o8qLsH5+RaLR2t+OHM3D0JlWGODQKQ4cxbK7WlvmUxpcj6Kgu6EKqjrGFe3QQ==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/Wombosvideo" + } + }, + "node_modules/type-check": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz", + "integrity": "sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==", + "dev": true, + "license": "MIT", + "dependencies": { + "prelude-ls": "^1.2.1" + }, + "engines": { + "node": ">= 0.8.0" + } + }, + "node_modules/type-fest": { + "version": "2.19.0", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-2.19.0.tgz", + "integrity": "sha512-RAH822pAdBgcNMAfWnCBU3CFZcfZ/i1eZjwFU/dsLKumyuuP3niueg2UAukXYF0E2AAoc82ZSSf9J0WQBinzHA==", + "dev": true, + "license": "(MIT OR CC0-1.0)", + "engines": { + "node": ">=12.20" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/type-is": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/type-is/-/type-is-2.1.0.tgz", + "integrity": "sha512-faYHw0anBbc/kWF3zFTEnxSFOAGUX9GFbOBthvDdLsIlEoWOFOtS0zgCiQYwIskL9iGXZL3kAXD8OoZ4GmMATA==", + "dev": true, + "license": "MIT", + "dependencies": { + "content-type": "^2.0.0", + "media-typer": "^1.1.0", + "mime-types": "^3.0.0" + }, + "engines": { + "node": ">= 18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, + "node_modules/type-is/node_modules/content-type": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/content-type/-/content-type-2.0.0.tgz", + "integrity": "sha512-j/O/d7GcZCyNl7/hwZAb606rzqkyvaDctLmckbxLzHvFBzTJHuGEdodATcP3yIRoDrLHkIATJuvzbFlp/ki2cQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/express" + } + }, + "node_modules/typed-array-buffer": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/typed-array-buffer/-/typed-array-buffer-1.0.3.tgz", + "integrity": "sha512-nAYYwfY3qnzX30IkA6AQZjVbtK6duGontcQm1WSG1MD94YLqK0515GNApXkoxKOWMusVssAHWLh9SeaoefYFGw==", "dev": true, "license": "MIT", - "funding": { - "url": "https://github.com/sponsors/Wombosvideo" + "dependencies": { + "call-bound": "^1.0.3", + "es-errors": "^1.3.0", + "is-typed-array": "^1.1.14" + }, + "engines": { + "node": ">= 0.4" } }, - "node_modules/type-check": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/type-check/-/type-check-0.4.0.tgz", - "integrity": "sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==", + "node_modules/typed-array-byte-length": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/typed-array-byte-length/-/typed-array-byte-length-1.0.3.tgz", + "integrity": "sha512-BaXgOuIxz8n8pIq3e7Atg/7s+DpiYrxn4vdot3w9KbnBhcRQq6o3xemQdIfynqSeXeDrF32x+WvfzmOjPiY9lg==", "dev": true, "license": "MIT", "dependencies": { - "prelude-ls": "^1.2.1" + "call-bind": "^1.0.8", + "for-each": "^0.3.3", + "gopd": "^1.2.0", + "has-proto": "^1.2.0", + "is-typed-array": "^1.1.14" }, "engines": { - "node": ">= 0.8.0" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/type-fest": { - "version": "2.19.0", - "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-2.19.0.tgz", - "integrity": "sha512-RAH822pAdBgcNMAfWnCBU3CFZcfZ/i1eZjwFU/dsLKumyuuP3niueg2UAukXYF0E2AAoc82ZSSf9J0WQBinzHA==", + "node_modules/typed-array-byte-offset": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/typed-array-byte-offset/-/typed-array-byte-offset-1.0.4.tgz", + "integrity": "sha512-bTlAFB/FBYMcuX81gbL4OcpH5PmlFHqlCCpAl8AlEzMz5k53oNDvN8p1PNOWLEmI2x4orp3raOFB51tv9X+MFQ==", "dev": true, - "license": "(MIT OR CC0-1.0)", + "license": "MIT", + "dependencies": { + "available-typed-arrays": "^1.0.7", + "call-bind": "^1.0.8", + "for-each": "^0.3.3", + "gopd": "^1.2.0", + "has-proto": "^1.2.0", + "is-typed-array": "^1.1.15", + "reflect.getprototypeof": "^1.0.9" + }, "engines": { - "node": ">=12.20" + "node": ">= 0.4" }, "funding": { - "url": "https://github.com/sponsors/sindresorhus" + "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/type-is": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/type-is/-/type-is-2.0.1.tgz", - "integrity": "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw==", + "node_modules/typed-array-length": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/typed-array-length/-/typed-array-length-1.0.8.tgz", + "integrity": "sha512-phPGCwqr2+Qo0fwniCE8e4pKnGu/yFb5nD5Y8bf0EEeiI5GklnACYA9GFy/DrAeRrKHXvHn+1SUsOWgJp6RO+g==", "dev": true, "license": "MIT", "dependencies": { - "content-type": "^1.0.5", - "media-typer": "^1.1.0", - "mime-types": "^3.0.0" + "call-bind": "^1.0.9", + "for-each": "^0.3.5", + "gopd": "^1.2.0", + "is-typed-array": "^1.1.15", + "possible-typed-array-names": "^1.1.0", + "reflect.getprototypeof": "^1.0.10" }, "engines": { - "node": ">= 0.6" + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" } }, "node_modules/typescript": { - "version": "5.8.3", - "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.8.3.tgz", - "integrity": "sha512-p1diW6TqL9L07nNxvRMM7hMMw4c5XOo/1ibL4aAIGmSAt9slTE1Xgw5KWuof2uTOvCg9BY7ZRi+GaF+7sfgPeQ==", + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", "dev": true, "license": "Apache-2.0", "bin": { @@ -11019,16 +16004,16 @@ } }, "node_modules/typescript-eslint": { - "version": "8.56.0", - "resolved": "https://registry.npmjs.org/typescript-eslint/-/typescript-eslint-8.56.0.tgz", - "integrity": "sha512-c7toRLrotJ9oixgdW7liukZpsnq5CZ7PuKztubGYlNppuTqhIoWfhgHo/7EU0v06gS2l/x0i2NEFK1qMIf0rIg==", + "version": "8.60.1", + "resolved": "https://registry.npmjs.org/typescript-eslint/-/typescript-eslint-8.60.1.tgz", + "integrity": "sha512-6m5hkkRAp8lKvhVpcprAIn5KkehQEh+47oHH2VGnExEh7dhNxXlg6GPAOIu6TxbVQxhebrJDvjl3020ooiWCMA==", "dev": true, "license": "MIT", "dependencies": { - "@typescript-eslint/eslint-plugin": "8.56.0", - "@typescript-eslint/parser": "8.56.0", - "@typescript-eslint/typescript-estree": "8.56.0", - "@typescript-eslint/utils": "8.56.0" + "@typescript-eslint/eslint-plugin": "8.60.1", + "@typescript-eslint/parser": "8.60.1", + "@typescript-eslint/typescript-estree": "8.60.1", + "@typescript-eslint/utils": "8.60.1" }, "engines": { "node": "^18.18.0 || ^20.9.0 || >=21.1.0" @@ -11039,16 +16024,110 @@ }, "peerDependencies": { "eslint": "^8.57.0 || ^9.0.0 || ^10.0.0", - "typescript": ">=4.8.4 <6.0.0" + "typescript": ">=4.8.4 <6.1.0" + } + }, + "node_modules/unbox-primitive": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/unbox-primitive/-/unbox-primitive-1.1.0.tgz", + "integrity": "sha512-nWJ91DjeOkej/TA8pXQ3myruKpKEYgqvpw9lz4OPHj/NWFNluYrjbz9j01CJ8yKQd2g4jFoOkINCTW2I5LEEyw==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.3", + "has-bigints": "^1.0.2", + "has-symbols": "^1.1.0", + "which-boxed-primitive": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/unconfig": { + "version": "7.5.0", + "resolved": "https://registry.npmjs.org/unconfig/-/unconfig-7.5.0.tgz", + "integrity": "sha512-oi8Qy2JV4D3UQ0PsopR28CzdQ3S/5A1zwsUwp/rosSbfhJ5z7b90bIyTwi/F7hCLD4SGcZVjDzd4XoUQcEanvA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@quansync/fs": "^1.0.0", + "defu": "^6.1.4", + "jiti": "^2.6.1", + "quansync": "^1.0.0", + "unconfig-core": "7.5.0" + }, + "funding": { + "url": "https://github.com/sponsors/antfu" + } + }, + "node_modules/unconfig-core": { + "version": "7.5.0", + "resolved": "https://registry.npmjs.org/unconfig-core/-/unconfig-core-7.5.0.tgz", + "integrity": "sha512-Su3FauozOGP44ZmKdHy2oE6LPjk51M/TRRjHv2HNCWiDvfvCoxC2lno6jevMA91MYAdCdwP05QnWdWpSbncX/w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@quansync/fs": "^1.0.0", + "quansync": "^1.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/antfu" } }, "node_modules/undici-types": { - "version": "7.16.0", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.16.0.tgz", - "integrity": "sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==", + "version": "7.18.2", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.18.2.tgz", + "integrity": "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w==", "dev": true, "license": "MIT" }, + "node_modules/unicode-canonical-property-names-ecmascript": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/unicode-canonical-property-names-ecmascript/-/unicode-canonical-property-names-ecmascript-2.0.1.tgz", + "integrity": "sha512-dA8WbNeb2a6oQzAQ55YlT5vQAWGV9WXOsi3SskE3bcCdM0P4SDd+24zS/OCacdRq5BkdsRj9q3Pg6YyQoxIGqg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/unicode-match-property-ecmascript": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/unicode-match-property-ecmascript/-/unicode-match-property-ecmascript-2.0.0.tgz", + "integrity": "sha512-5kaZCrbp5mmbz5ulBkDkbY0SsPOjKqVS35VpL9ulMPfSl0J0Xsm+9Evphv9CoIZFwre7aJoa94AY6seMKGVN5Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "unicode-canonical-property-names-ecmascript": "^2.0.0", + "unicode-property-aliases-ecmascript": "^2.0.0" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/unicode-match-property-value-ecmascript": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/unicode-match-property-value-ecmascript/-/unicode-match-property-value-ecmascript-2.2.1.tgz", + "integrity": "sha512-JQ84qTuMg4nVkx8ga4A16a1epI9H6uTXAknqxkGF/aFfRLw1xC/Bp24HNLaZhHSkWd3+84t8iXnp1J0kYcZHhg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/unicode-property-aliases-ecmascript": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/unicode-property-aliases-ecmascript/-/unicode-property-aliases-ecmascript-2.2.0.tgz", + "integrity": "sha512-hpbDzxUY9BFwX+UeBnxv3Sh1q7HFxj48DTmXchNgRa46lO8uj3/1iEn3MiNUYTg1g9ctIqXCCERn8gYZhHC5lQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, "node_modules/unified": { "version": "11.0.5", "resolved": "https://registry.npmjs.org/unified/-/unified-11.0.5.tgz", @@ -11088,6 +16167,19 @@ "node": ">= 0.8.0" } }, + "node_modules/unique-string": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/unique-string/-/unique-string-2.0.0.tgz", + "integrity": "sha512-uNaeirEPvpZWSgzwsPGtU2zVSTrn/8L5q/IexZmH0eH6SA73CmAA5U4GwORTxQAZs95TAXLNqeLoPPNO5gZfWg==", + "dev": true, + "license": "MIT", + "dependencies": { + "crypto-random-string": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/unist-util-find-after": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/unist-util-find-after/-/unist-util-find-after-5.0.0.tgz", @@ -11111,9 +16203,9 @@ "license": "MIT" }, "node_modules/unist-util-is": { - "version": "6.0.0", - "resolved": "https://registry.npmjs.org/unist-util-is/-/unist-util-is-6.0.0.tgz", - "integrity": "sha512-2qCTHimwdxLfz+YzdGfkqNlH0tLi9xjTnHddPmJwtIG9MGsdbutfTc4P+haPD7l7Cjxf/WZj+we5qfVPvvxfYw==", + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/unist-util-is/-/unist-util-is-6.0.1.tgz", + "integrity": "sha512-LsiILbtBETkDz8I9p1dQ0uyRUWuaQzd/cuEeS1hoRSyW5E5XGmTzlwY1OrNzzakGowI9Dr/I8HVaw4hTtnxy8g==", "dev": true, "license": "MIT", "dependencies": { @@ -11175,23 +16267,30 @@ "license": "MIT" }, "node_modules/unist-util-stringify-position": { - "version": "2.0.3", - "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-2.0.3.tgz", - "integrity": "sha512-3faScn5I+hy9VleOq/qNbAd6pAx7iH5jYBMS9I1HgQVijz/4mv5Bvw5iw1sC/90CODiKo81G/ps8AJrISn687g==", + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz", + "integrity": "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ==", "dev": true, "license": "MIT", "dependencies": { - "@types/unist": "^2.0.2" + "@types/unist": "^3.0.0" }, "funding": { "type": "opencollective", "url": "https://opencollective.com/unified" } }, + "node_modules/unist-util-stringify-position/node_modules/@types/unist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", + "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", + "dev": true, + "license": "MIT" + }, "node_modules/unist-util-visit": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/unist-util-visit/-/unist-util-visit-5.0.0.tgz", - "integrity": "sha512-MR04uvD+07cwl/yhVuVWAtw+3GOR/knlL55Nd/wAdblk27GCVt3lqpTivy/tkJcZoNPzTwS1Y+KMojlLDhoTzg==", + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/unist-util-visit/-/unist-util-visit-5.1.0.tgz", + "integrity": "sha512-m+vIdyeCOpdr/QeQCu2EzxX/ohgS8KbnPDgFni4dQsfSCtpz8UqDyY5GjRru8PDKuYn7Fq19j1CQ+nJSsGKOzg==", "dev": true, "license": "MIT", "dependencies": { @@ -11205,9 +16304,9 @@ } }, "node_modules/unist-util-visit-parents": { - "version": "6.0.1", - "resolved": "https://registry.npmjs.org/unist-util-visit-parents/-/unist-util-visit-parents-6.0.1.tgz", - "integrity": "sha512-L/PqWzfTP9lzzEa6CKs0k2nARxTdZduw3zyh8d2NVBnsyvHjSX4TWse388YrrQKbvI8w20fGjGlhgT96WwKykw==", + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/unist-util-visit-parents/-/unist-util-visit-parents-6.0.2.tgz", + "integrity": "sha512-goh1s1TBrqSqukSc8wrjwWhL0hiJxgA8m4kFxGlQ+8FYQ3C/m11FcTs4YYem7V664AhHVvgoQLk890Ssdsr2IQ==", "dev": true, "license": "MIT", "dependencies": { @@ -11269,6 +16368,48 @@ "node": ">=18.12.0" } }, + "node_modules/upath": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/upath/-/upath-1.2.0.tgz", + "integrity": "sha512-aZwGpamFO61g3OlfT7OQCHqhGnW43ieH9WZeP7QxN/G/jS4jfqUkZxoryvJgVPEcrl5NL/ggHsSmLMHuH64Lhg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4", + "yarn": "*" + } + }, + "node_modules/update-browserslist-db": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/update-browserslist-db/-/update-browserslist-db-1.2.3.tgz", + "integrity": "sha512-Js0m9cx+qOgDxo0eMiFGEueWztz+d4+M3rGlmKPT+T4IS/jP4ylw3Nwpu6cpTTP8R1MAC1kF4VbdLt3ARf209w==", + "dev": true, + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/browserslist" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/browserslist" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "license": "MIT", + "dependencies": { + "escalade": "^3.2.0", + "picocolors": "^1.1.1" + }, + "bin": { + "update-browserslist-db": "cli.js" + }, + "peerDependencies": { + "browserslist": ">= 4.21.0" + } + }, "node_modules/uri-js": { "version": "4.4.1", "resolved": "https://registry.npmjs.org/uri-js/-/uri-js-4.4.1.tgz", @@ -11379,31 +16520,31 @@ "url": "https://opencollective.com/unified" } }, - "node_modules/vfile/node_modules/@types/unist": { - "version": "3.0.3", - "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", - "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", - "dev": true, - "license": "MIT" - }, - "node_modules/vfile/node_modules/unist-util-stringify-position": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-4.0.0.tgz", - "integrity": "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ==", + "node_modules/vfile-message/node_modules/unist-util-stringify-position": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/unist-util-stringify-position/-/unist-util-stringify-position-2.0.3.tgz", + "integrity": "sha512-3faScn5I+hy9VleOq/qNbAd6pAx7iH5jYBMS9I1HgQVijz/4mv5Bvw5iw1sC/90CODiKo81G/ps8AJrISn687g==", "dev": true, "license": "MIT", "dependencies": { - "@types/unist": "^3.0.0" + "@types/unist": "^2.0.2" }, "funding": { "type": "opencollective", "url": "https://opencollective.com/unified" } }, + "node_modules/vfile/node_modules/@types/unist": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/@types/unist/-/unist-3.0.3.tgz", + "integrity": "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q==", + "dev": true, + "license": "MIT" + }, "node_modules/vfile/node_modules/vfile-message": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.2.tgz", - "integrity": "sha512-jRDZ1IMLttGj41KcZvlrYAaI3CfqpLpfpf+Mfig13viT6NKvRzWZ+lXz0Y5D60w6uJIBAOGq9mSHf0gktF0duw==", + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/vfile-message/-/vfile-message-4.0.3.tgz", + "integrity": "sha512-QTHzsGd1EhbZs4AsQ20JX1rC3cOlt/IWJruk893DfLRr57lcnOeMaWG4K0JrRta4mIJZKth2Au3mM3u03/JWKw==", "dev": true, "license": "MIT", "dependencies": { @@ -11416,9 +16557,9 @@ } }, "node_modules/vite": { - "version": "7.3.2", - "resolved": "https://registry.npmjs.org/vite/-/vite-7.3.2.tgz", - "integrity": "sha512-Bby3NOsna2jsjfLVOHKes8sGwgl4TT0E6vvpYgnAYDIF/tie7MRaFthmKuHx1NSXjiTueXH3do80FMQgvEktRg==", + "version": "7.3.5", + "resolved": "https://registry.npmjs.org/vite/-/vite-7.3.5.tgz", + "integrity": "sha512-KuOaNhcnGFN2zIPGA7wRmzF+lJA1sea7rHq17aiJ++9lzY1WWG6Jpwqwe1KNbRVPIqHmr8GLYx7jbrQcN/7/ww==", "dev": true, "license": "MIT", "dependencies": { @@ -11517,6 +16658,37 @@ "uuid": "dist/esm/bin/uuid" } }, + "node_modules/vite-plugin-pwa": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/vite-plugin-pwa/-/vite-plugin-pwa-1.3.0.tgz", + "integrity": "sha512-c5kMgN+ITrOtHXp8PAtk2uOIEea6XjP/unCGxOWWBzQ6qa65qj/awHg0wf+QF9E/2u9vh86LqxPwzEPNbM2r5A==", + "dev": true, + "license": "MIT", + "dependencies": { + "debug": "^4.3.6", + "pretty-bytes": "^6.1.1", + "tinyglobby": "^0.2.10", + "workbox-build": "^7.4.1", + "workbox-window": "^7.4.1" + }, + "engines": { + "node": ">=16.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/antfu" + }, + "peerDependencies": { + "@vite-pwa/assets-generator": "^1.0.0", + "vite": "^3.1.0 || ^4.0.0 || ^5.0.0 || ^6.0.0 || ^7.0.0 || ^8.0.0", + "workbox-build": "^7.4.1", + "workbox-window": "^7.4.1" + }, + "peerDependenciesMeta": { + "@vite-pwa/assets-generator": { + "optional": true + } + } + }, "node_modules/vite/node_modules/fsevents": { "version": "2.3.3", "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", @@ -11533,9 +16705,9 @@ } }, "node_modules/vitefu": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/vitefu/-/vitefu-1.1.1.tgz", - "integrity": "sha512-B/Fegf3i8zh0yFbpzZ21amWzHmuNlLlmJT6n7bu5e+pCHUKQIfXSYokrqOBGEMMe9UG2sostKQF9mml/vYaWJQ==", + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/vitefu/-/vitefu-1.1.3.tgz", + "integrity": "sha512-ub4okH7Z5KLjb6hDyjqrGXqWtWvoYdU3IGm/NorpgHncKoLTCfRIbvlhBm7r0YstIaQRYlp4yEbFqDcKSzXSSg==", "dev": true, "license": "MIT", "workspaces": [ @@ -11544,7 +16716,7 @@ "tests/projects/workspace/packages/*" ], "peerDependencies": { - "vite": "^3.0.0 || ^4.0.0 || ^5.0.0 || ^6.0.0 || ^7.0.0-beta.0" + "vite": "^3.0.0 || ^4.0.0 || ^5.0.0 || ^6.0.0 || ^7.0.0 || ^8.0.0" }, "peerDependenciesMeta": { "vite": { @@ -11666,49 +16838,11 @@ "dev": true, "license": "MIT", "dependencies": { - "@standard-schema/spec": "^1.1.0", - "@types/chai": "^5.2.2", - "@vitest/spy": "4.1.8", - "@vitest/utils": "4.1.8", - "chai": "^6.2.2", - "tinyrainbow": "^3.1.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, - "node_modules/vitest/node_modules/@vitest/pretty-format": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/pretty-format/-/pretty-format-4.1.8.tgz", - "integrity": "sha512-9GasEBxpZ1VYIpqHf/0+YGg121uSNwCKOJqIrTwWP/TB7DmFCiaBpNl3aPZzoLWfWkuqhbH8vJIVobZkvdo2cA==", - "dev": true, - "license": "MIT", - "dependencies": { - "tinyrainbow": "^3.1.0" - }, - "funding": { - "url": "https://opencollective.com/vitest" - } - }, - "node_modules/vitest/node_modules/@vitest/spy": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/spy/-/spy-4.1.8.tgz", - "integrity": "sha512-6EevtBp6OZOPF7bmz36HrGMeP3txgVSrgebWxHOafDXGkhIzfXK14f8KF6MuFfgXXUeHxmpD3BQxkV00/3s5mA==", - "dev": true, - "license": "MIT", - "funding": { - "url": "https://opencollective.com/vitest" - } - }, - "node_modules/vitest/node_modules/@vitest/utils": { - "version": "4.1.8", - "resolved": "https://registry.npmjs.org/@vitest/utils/-/utils-4.1.8.tgz", - "integrity": "sha512-uOJamYALNhfJ6iolExyQM40yIQwDqYnkKtQ5VCiSe17E33H0aQ/u+1GlRuz4LZBk6Mm3sg90G9hEbmEt37C1Zg==", - "dev": true, - "license": "MIT", - "dependencies": { - "@vitest/pretty-format": "4.1.8", - "convert-source-map": "^2.0.0", + "@standard-schema/spec": "^1.1.0", + "@types/chai": "^5.2.2", + "@vitest/spy": "4.1.8", + "@vitest/utils": "4.1.8", + "chai": "^6.2.2", "tinyrainbow": "^3.1.0" }, "funding": { @@ -11746,6 +16880,13 @@ "url": "https://github.com/sponsors/wooorm" } }, + "node_modules/webidl-conversions": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-4.0.2.tgz", + "integrity": "sha512-YQ+BmxuTgd6UXZW3+ICGfyqRyHXVlD5GtQr5+qjiNW7bF0cqrzX500HVXPBOvgXb5YnzDd+h0zqyv61KUD7+Sg==", + "dev": true, + "license": "BSD-2-Clause" + }, "node_modules/webpack-virtual-modules": { "version": "0.6.2", "resolved": "https://registry.npmjs.org/webpack-virtual-modules/-/webpack-virtual-modules-0.6.2.tgz", @@ -11757,6 +16898,7 @@ "version": "2.0.0", "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-2.0.0.tgz", "integrity": "sha512-p41ogyeMUrw3jWclHWTQg1k05DSVXPLcVxRTYsXUk+ZooOCZLcoYgPZ/HL/D/N+uQPOtcp1me1WhBEaX02mhWg==", + "deprecated": "Use @exodus/bytes instead for a more spec-conformant and faster implementation", "dev": true, "license": "MIT", "dependencies": { @@ -11766,6 +16908,31 @@ "node": ">=12" } }, + "node_modules/whatwg-encoding/node_modules/iconv-lite": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.6.3.tgz", + "integrity": "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw==", + "dev": true, + "license": "MIT", + "dependencies": { + "safer-buffer": ">= 2.1.2 < 3.0.0" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/whatwg-url": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-7.1.0.tgz", + "integrity": "sha512-WUu7Rg1DroM7oQvGWfOiAK21n74Gg+T4elXEQYkOhtyLeWiJFoOGLXPKI/9gzIie9CtwVLm8wtw6YJdKyxSjeg==", + "dev": true, + "license": "MIT", + "dependencies": { + "lodash.sortby": "^4.7.0", + "tr46": "^1.0.1", + "webidl-conversions": "^4.0.2" + } + }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", @@ -11782,6 +16949,95 @@ "node": ">= 8" } }, + "node_modules/which-boxed-primitive": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/which-boxed-primitive/-/which-boxed-primitive-1.1.1.tgz", + "integrity": "sha512-TbX3mj8n0odCBFVlY8AxkqcHASw3L60jIuF8jFP78az3C2YhmGvqbHBpAjTRH2/xqYunrJ9g1jSyjCjpoWzIAA==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-bigint": "^1.1.0", + "is-boolean-object": "^1.2.1", + "is-number-object": "^1.1.1", + "is-string": "^1.1.1", + "is-symbol": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/which-builtin-type": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/which-builtin-type/-/which-builtin-type-1.2.1.tgz", + "integrity": "sha512-6iBczoX+kDQ7a3+YJBnh3T+KZRxM/iYNPXicqk66/Qfm1b93iu+yOImkg0zHbj5LNOcNv1TEADiZ0xa34B4q6Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "call-bound": "^1.0.2", + "function.prototype.name": "^1.1.6", + "has-tostringtag": "^1.0.2", + "is-async-function": "^2.0.0", + "is-date-object": "^1.1.0", + "is-finalizationregistry": "^1.1.0", + "is-generator-function": "^1.0.10", + "is-regex": "^1.2.1", + "is-weakref": "^1.0.2", + "isarray": "^2.0.5", + "which-boxed-primitive": "^1.1.0", + "which-collection": "^1.0.2", + "which-typed-array": "^1.1.16" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/which-collection": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/which-collection/-/which-collection-1.0.2.tgz", + "integrity": "sha512-K4jVyjnBdgvc86Y6BkaLZEN933SwYOuBFkdmBu9ZfkcAbdVbpITnDmjvZ/aQjRXQrv5EPkTnD1s39GiiqbngCw==", + "dev": true, + "license": "MIT", + "dependencies": { + "is-map": "^2.0.3", + "is-set": "^2.0.3", + "is-weakmap": "^2.0.2", + "is-weakset": "^2.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/which-typed-array": { + "version": "1.1.21", + "resolved": "https://registry.npmjs.org/which-typed-array/-/which-typed-array-1.1.21.tgz", + "integrity": "sha512-zbRA8cVm6io/d5W8uIe2hblzN76/Wm3v/yiythQvr+dpBWeqhPSWIDNj4zOyHi4zKbMK6DN34Xsr9jPHJERAEw==", + "dev": true, + "license": "MIT", + "dependencies": { + "available-typed-arrays": "^1.0.7", + "call-bind": "^1.0.9", + "call-bound": "^1.0.4", + "for-each": "^0.3.5", + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-tostringtag": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, "node_modules/why-is-node-running": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/why-is-node-running/-/why-is-node-running-2.3.0.tgz", @@ -11809,6 +17065,357 @@ "node": ">=0.10.0" } }, + "node_modules/workbox-background-sync": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-background-sync/-/workbox-background-sync-7.4.1.tgz", + "integrity": "sha512-HhT7KE8tOWDm02wRNshXUnUPofMlhenF2DBdUnDPOubhizzPeItkYTmAB6td1Z2cjYPa98vzEiPLEuzn5hN66g==", + "dev": true, + "license": "MIT", + "dependencies": { + "idb": "^7.0.1", + "workbox-core": "7.4.1" + } + }, + "node_modules/workbox-broadcast-update": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-broadcast-update/-/workbox-broadcast-update-7.4.1.tgz", + "integrity": "sha512-uAlgslKLvbQY+suirIdnBCSYrcgBhjp81Nj4l1lj/Jmj0MJO2CJERnCJjT0GFVwmReV0N+zs78K6gqd5gr9/+A==", + "dev": true, + "license": "MIT", + "dependencies": { + "workbox-core": "7.4.1" + } + }, + "node_modules/workbox-build": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-build/-/workbox-build-7.4.1.tgz", + "integrity": "sha512-SDhxIvEAde9Gy/5w4Yo1Jh/M49Z0qE3q0oteyE8zGq0DScxFqVBcCtIXFuLtmtxRQZCMbf0prco4VyEu3KBQuw==", + "dev": true, + "license": "MIT", + "dependencies": { + "@apideck/better-ajv-errors": "^0.3.1", + "@babel/core": "^7.24.4", + "@babel/preset-env": "^7.11.0", + "@babel/runtime": "^7.11.2", + "@rollup/plugin-babel": "^6.1.0", + "@rollup/plugin-node-resolve": "^16.0.3", + "@rollup/plugin-replace": "^6.0.3", + "@rollup/plugin-terser": "^1.0.0", + "@trickfilm400/rollup-plugin-off-main-thread": "^3.0.0-pre1", + "ajv": "^8.6.0", + "common-tags": "^1.8.0", + "eta": "^4.5.1", + "fast-json-stable-stringify": "^2.1.0", + "fs-extra": "^9.0.1", + "glob": "^11.0.1", + "pretty-bytes": "^5.3.0", + "rollup": "^4.53.3", + "source-map": "^0.8.0-beta.0", + "stringify-object": "^3.3.0", + "strip-comments": "^2.0.1", + "tempy": "^0.6.0", + "upath": "^1.2.0", + "workbox-background-sync": "7.4.1", + "workbox-broadcast-update": "7.4.1", + "workbox-cacheable-response": "7.4.1", + "workbox-core": "7.4.1", + "workbox-expiration": "7.4.1", + "workbox-google-analytics": "7.4.1", + "workbox-navigation-preload": "7.4.1", + "workbox-precaching": "7.4.1", + "workbox-range-requests": "7.4.1", + "workbox-recipes": "7.4.1", + "workbox-routing": "7.4.1", + "workbox-strategies": "7.4.1", + "workbox-streams": "7.4.1", + "workbox-sw": "7.4.1", + "workbox-window": "7.4.1" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/workbox-build/node_modules/@isaacs/cliui": { + "version": "9.0.0", + "resolved": "https://registry.npmjs.org/@isaacs/cliui/-/cliui-9.0.0.tgz", + "integrity": "sha512-AokJm4tuBHillT+FpMtxQ60n8ObyXBatq7jD2/JA9dxbDDokKQm8KMht5ibGzLVU9IJDIKK4TPKgMHEYMn3lMg==", + "dev": true, + "license": "BlueOak-1.0.0", + "engines": { + "node": ">=18" + } + }, + "node_modules/workbox-build/node_modules/balanced-match": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-4.0.4.tgz", + "integrity": "sha512-BLrgEcRTwX2o6gGxGOCNyMvGSp35YofuYzw9h1IMTRmKqttAZZVU67bdb9Pr2vUHA8+j3i2tJfjO6C6+4myGTA==", + "dev": true, + "license": "MIT", + "engines": { + "node": "18 || 20 || >=22" + } + }, + "node_modules/workbox-build/node_modules/brace-expansion": { + "version": "5.0.6", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-5.0.6.tgz", + "integrity": "sha512-kLpxurY4Z4r9sgMsyG0Z9uzsBlgiU/EFKhj/h91/8yHu0edo7XuixOIH3VcJ8kkxs6/jPzoI6U9Vj3WqbMQ94g==", + "dev": true, + "license": "MIT", + "dependencies": { + "balanced-match": "^4.0.2" + }, + "engines": { + "node": "18 || 20 || >=22" + } + }, + "node_modules/workbox-build/node_modules/glob": { + "version": "11.1.0", + "resolved": "https://registry.npmjs.org/glob/-/glob-11.1.0.tgz", + "integrity": "sha512-vuNwKSaKiqm7g0THUBu2x7ckSs3XJLXE+2ssL7/MfTGPLLcrJQ/4Uq1CjPTtO5cCIiRxqvN6Twy1qOwhL0Xjcw==", + "deprecated": "Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me", + "dev": true, + "license": "BlueOak-1.0.0", + "dependencies": { + "foreground-child": "^3.3.1", + "jackspeak": "^4.1.1", + "minimatch": "^10.1.1", + "minipass": "^7.1.2", + "package-json-from-dist": "^1.0.0", + "path-scurry": "^2.0.0" + }, + "bin": { + "glob": "dist/esm/bin.mjs" + }, + "engines": { + "node": "20 || >=22" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/workbox-build/node_modules/jackspeak": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/jackspeak/-/jackspeak-4.2.3.tgz", + "integrity": "sha512-ykkVRwrYvFm1nb2AJfKKYPr0emF6IiXDYUaFx4Zn9ZuIH7MrzEZ3sD5RlqGXNRpHtvUHJyOnCEFxOlNDtGo7wg==", + "dev": true, + "license": "BlueOak-1.0.0", + "dependencies": { + "@isaacs/cliui": "^9.0.0" + }, + "engines": { + "node": "20 || >=22" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/workbox-build/node_modules/lru-cache": { + "version": "11.5.1", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-11.5.1.tgz", + "integrity": "sha512-RPimw/7aMdv2oqRrxKwvZXcPfwBrn/JZ2xYcY9Hus/6LaS3VOAKVWKWgNLCFSiOm1ESXinjsDlidVU7JlnCN2A==", + "dev": true, + "license": "BlueOak-1.0.0", + "engines": { + "node": "20 || >=22" + } + }, + "node_modules/workbox-build/node_modules/minimatch": { + "version": "10.2.5", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-10.2.5.tgz", + "integrity": "sha512-MULkVLfKGYDFYejP07QOurDLLQpcjk7Fw+7jXS2R2czRQzR56yHRveU5NDJEOviH+hETZKSkIk5c+T23GjFUMg==", + "dev": true, + "license": "BlueOak-1.0.0", + "dependencies": { + "brace-expansion": "^5.0.5" + }, + "engines": { + "node": "18 || 20 || >=22" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/workbox-build/node_modules/path-scurry": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/path-scurry/-/path-scurry-2.0.2.tgz", + "integrity": "sha512-3O/iVVsJAPsOnpwWIeD+d6z/7PmqApyQePUtCndjatj/9I5LylHvt5qluFaBT3I5h3r1ejfR056c+FCv+NnNXg==", + "dev": true, + "license": "BlueOak-1.0.0", + "dependencies": { + "lru-cache": "^11.0.0", + "minipass": "^7.1.2" + }, + "engines": { + "node": "18 || 20 || >=22" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/workbox-build/node_modules/pretty-bytes": { + "version": "5.6.0", + "resolved": "https://registry.npmjs.org/pretty-bytes/-/pretty-bytes-5.6.0.tgz", + "integrity": "sha512-FFw039TmrBqFK8ma/7OL3sDz/VytdtJr044/QUJtH0wK9lb9jLq9tJyIxUwtQJHwar2BqtiA4iCWSwo9JLkzFg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/workbox-build/node_modules/source-map": { + "version": "0.8.0-beta.0", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.8.0-beta.0.tgz", + "integrity": "sha512-2ymg6oRBpebeZi9UUNsgQ89bhx01TcTkmNTGnNO88imTmbSgy4nfujrgVEFKWpMTEGA11EDkTt7mqObTPdigIA==", + "deprecated": "The work that was done in this beta branch won't be included in future versions", + "dev": true, + "license": "BSD-3-Clause", + "dependencies": { + "whatwg-url": "^7.0.0" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/workbox-cacheable-response": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-cacheable-response/-/workbox-cacheable-response-7.4.1.tgz", + "integrity": "sha512-8xaFoJdDc2OjrlbbL3gEeBO1WKcMwRqwLRupgqahYXu75yXajPLuwrbXMrIGZuWYXrQwk0xDjOxZ/ujCy/oJYw==", + "dev": true, + "license": "MIT", + "dependencies": { + "workbox-core": "7.4.1" + } + }, + "node_modules/workbox-core": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-core/-/workbox-core-7.4.1.tgz", + "integrity": "sha512-DT+vu46eh/2vRsSHTY4Xmc32Z1rr9PRlQUXr1Dx30ZuXRWwOsvZgGgcwxcasubQLQmbTNYZjv44LkBAQ4tT5tQ==", + "dev": true, + "license": "MIT" + }, + "node_modules/workbox-expiration": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-expiration/-/workbox-expiration-7.4.1.tgz", + "integrity": "sha512-lRKUF7b+OGbeXkQk1s6MHXOa3d7Xxf7Of31W6c6hCfipfIyrtdWZ89stq21AHZMaoG7VNFoHply4Ox+rU31TWg==", + "dev": true, + "license": "MIT", + "dependencies": { + "idb": "^7.0.1", + "workbox-core": "7.4.1" + } + }, + "node_modules/workbox-google-analytics": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-google-analytics/-/workbox-google-analytics-7.4.1.tgz", + "integrity": "sha512-Mks1JwLEt++ZAkF6sS1OpSh9RtAMIsiDgRpK+codiHGIPXeaUOgi4cPc3GFadUl8V5QPeypEk8Oxgl3HlwVzHw==", + "dev": true, + "license": "MIT", + "dependencies": { + "workbox-background-sync": "7.4.1", + "workbox-core": "7.4.1", + "workbox-routing": "7.4.1", + "workbox-strategies": "7.4.1" + } + }, + "node_modules/workbox-navigation-preload": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-navigation-preload/-/workbox-navigation-preload-7.4.1.tgz", + "integrity": "sha512-C4KVsjPcYKJOhr631AxR9XoG2rLF3QiTk5aMv36MXOjtWvm8axwNFAtKUPGsWUwLXXAMgYM1En7fsvndaXeXRQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "workbox-core": "7.4.1" + } + }, + "node_modules/workbox-precaching": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-precaching/-/workbox-precaching-7.4.1.tgz", + "integrity": "sha512-cdr/9qByww7yzEp7zg/qI4ukUrrNjQLgN+ONQRpjy/VqGQXwkgHwr00KksGJK8v0VifwDXBb8a4cWNZH71jn3Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "workbox-core": "7.4.1", + "workbox-routing": "7.4.1", + "workbox-strategies": "7.4.1" + } + }, + "node_modules/workbox-range-requests": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-range-requests/-/workbox-range-requests-7.4.1.tgz", + "integrity": "sha512-7i2oxAUE82gHdAJBCAQ04JzNOdRPqzuOzGfoUyJpFSmeqBNYGPrAH8GPoPjUQTfp+NycwrD2H68VtuF8qxv0vQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "workbox-core": "7.4.1" + } + }, + "node_modules/workbox-recipes": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-recipes/-/workbox-recipes-7.4.1.tgz", + "integrity": "sha512-gnbVfmV4/TtmQaM4x9AtuXhcdstJsep3XMVeztOrQVPT+R6+6DeBjGTCQ7fFCXm+4GEHUA5VEBTyi5+4gWGeog==", + "dev": true, + "license": "MIT", + "dependencies": { + "workbox-cacheable-response": "7.4.1", + "workbox-core": "7.4.1", + "workbox-expiration": "7.4.1", + "workbox-precaching": "7.4.1", + "workbox-routing": "7.4.1", + "workbox-strategies": "7.4.1" + } + }, + "node_modules/workbox-routing": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-routing/-/workbox-routing-7.4.1.tgz", + "integrity": "sha512-yubJGErZOusuidAenaL5ypfhQOa7urxP/f8E0ws7FPb4039RiWXUWBAyUkmUoOL/BcQGen3h0J8872d51IYxtA==", + "dev": true, + "license": "MIT", + "dependencies": { + "workbox-core": "7.4.1" + } + }, + "node_modules/workbox-strategies": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-strategies/-/workbox-strategies-7.4.1.tgz", + "integrity": "sha512-GZxpaw9NbmOelj7667uZ2kpk5BFpOGbO4X0qjwh5ls8XQ8C+Lha5LQchTiUzsTFSS+NlUpftYAyOVXvQUrcqOQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "workbox-core": "7.4.1" + } + }, + "node_modules/workbox-streams": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-streams/-/workbox-streams-7.4.1.tgz", + "integrity": "sha512-HWWtraKUbJknd9kgqGcpQ3G114HOPYvqs8HaJMDs2ebLNAimDkVDaWfAXE6Ybl+m8U6KsCE6pWyLYuigWmnAXw==", + "dev": true, + "license": "MIT", + "dependencies": { + "workbox-core": "7.4.1", + "workbox-routing": "7.4.1" + } + }, + "node_modules/workbox-sw": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-sw/-/workbox-sw-7.4.1.tgz", + "integrity": "sha512-fez5f2DUlDJWTFYkCWQpY10N8gtztd849NswCbVFk0QlcSM4HT5A8x4g4ii650yem4I8tHY0R7JZahwp3ltIPw==", + "dev": true, + "license": "MIT" + }, + "node_modules/workbox-window": { + "version": "7.4.1", + "resolved": "https://registry.npmjs.org/workbox-window/-/workbox-window-7.4.1.tgz", + "integrity": "sha512-notZDH2u8VXaqyuD7xaqIfEFi6SRM4SUSd7ewe9PDsVqADuepxX2ZMY3uvuZGxzY5ZOsGC/vD3A/3smFtJt4/A==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/trusted-types": "^2.0.2", + "workbox-core": "7.4.1" + } + }, "node_modules/wrappy": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", @@ -11817,9 +17424,9 @@ "license": "ISC" }, "node_modules/ws": { - "version": "8.20.1", - "resolved": "https://registry.npmjs.org/ws/-/ws-8.20.1.tgz", - "integrity": "sha512-It4dO0K5v//JtTXuPkfEOaI3uUN87iYPnqo/ZzqCoG3g8uhA66QUMs/SrM0YK7/NAu+r4LMh/9dq2A7k+rHs+w==", + "version": "8.21.0", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.21.0.tgz", + "integrity": "sha512-Vsp28b7DRcimFQvrqu2Wek3z1iYxDCWqHYB8Qsnk/S4RfaCQzPGPyBNuVjJV3cd6UiKtUtp6sNM77gWvzcCH+g==", "dev": true, "license": "MIT", "engines": { @@ -11855,14 +17462,11 @@ } }, "node_modules/yallist": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-5.0.0.tgz", - "integrity": "sha512-YgvUTfwqyc7UXVMrB+SImsVYSmTS8X/tSrtdNZMImM+n7+QTriRXyXim0mBrTXNeqzVF0KWGgHPeiyViFFrNDw==", + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.1.1.tgz", + "integrity": "sha512-a4UGQaWPH59mOXUYnAG2ewncQS4i4F43Tv3JoAM+s2VDAmS9NsK8GpDMLrCHPksFT7h3K6TOoUNn2pb7RoXx4g==", "dev": true, - "license": "BlueOak-1.0.0", - "engines": { - "node": ">=18" - } + "license": "ISC" }, "node_modules/yocto-queue": { "version": "0.1.0", @@ -11878,16 +17482,16 @@ } }, "node_modules/zimmerframe": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/zimmerframe/-/zimmerframe-1.1.2.tgz", - "integrity": "sha512-rAbqEGa8ovJy4pyBxZM70hg4pE6gDgaQ0Sl9M3enG3I0d6H4XSAM3GeNGLKnsBpuijUow064sf7ww1nutC5/3w==", + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/zimmerframe/-/zimmerframe-1.1.4.tgz", + "integrity": "sha512-B58NGBEoc8Y9MWWCQGl/gq9xBCe4IiKM0a2x7GZdQKOW5Exr8S1W24J6OgM1njK8xCRGvAJIL/MxXHf6SkmQKQ==", "dev": true, "license": "MIT" }, "node_modules/zod": { - "version": "4.2.1", - "resolved": "https://registry.npmjs.org/zod/-/zod-4.2.1.tgz", - "integrity": "sha512-0wZ1IRqGGhMP76gLqz8EyfBXKk0J2qo2+H3fi4mcUP/KtTocoX08nmIAHl1Z2kJIZbZee8KOpBCSNPRgauucjw==", + "version": "4.4.3", + "resolved": "https://registry.npmjs.org/zod/-/zod-4.4.3.tgz", + "integrity": "sha512-ytENFjIJFl2UwYglde2jchW2Hwm4GJFLDiSXWdTrJQBIN9Fcyp7n4DhxJEiWNAJMV1/BqWfW/kkg71UDcHJyTQ==", "dev": true, "license": "MIT", "funding": { @@ -11895,13 +17499,13 @@ } }, "node_modules/zod-to-json-schema": { - "version": "3.25.1", - "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.25.1.tgz", - "integrity": "sha512-pM/SU9d3YAggzi6MtR4h7ruuQlqKtad8e9S0fmxcMi+ueAK5Korys/aWcV9LIIHTVbj01NdzxcnXSN+O74ZIVA==", + "version": "3.25.2", + "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.25.2.tgz", + "integrity": "sha512-O/PgfnpT1xKSDeQYSCfRI5Gy3hPf91mKVDuYLUHZJMiDFptvP41MSnWofm8dnCm0256ZNfZIM7DSzuSMAFnjHA==", "dev": true, "license": "ISC", "peerDependencies": { - "zod": "^3.25 || ^4" + "zod": "^3.25.28 || ^4" } }, "node_modules/zwitch": { diff --git a/tools/ui/package.json b/tools/ui/package.json index 4f5ef4d64fa0..bcb4165d1015 100644 --- a/tools/ui/package.json +++ b/tools/ui/package.json @@ -4,8 +4,9 @@ "version": "1.0.0", "type": "module", "scripts": { + "build": "npm run build-pwa-assets && vite build", + "build-pwa-assets": "npx @vite-pwa/assets-generator --root . --config pwa-assets.config.ts && npx @vite-pwa/assets-generator --root . --config pwa-assets-dark.config.ts && node scripts/make-icons-circular.js", "dev": "bash scripts/dev.sh", - "build": "vite build", "preview": "vite preview", "prepare": "svelte-kit sync || echo ''", "check": "svelte-kit sync && svelte-check --tsconfig ./tsconfig.json", @@ -15,12 +16,15 @@ "lint": "prettier --check . && eslint .", "test": "npm run test:ui -- --run && npm run test:client -- --run && npm run test:unit -- --run && npm run test:e2e", "test:e2e": "playwright test", + "test:e2e:pwa": "playwright test tests/e2e/pwa.e2e.ts", "test:client": "vitest --project=client", "test:unit": "vitest --project=unit", + "test:unit:pwa": "vitest --project=unit --run tests/unit/pwa.spec.ts", + "test:pwa": "npm run test:unit:pwa && npm run test:e2e:pwa", "test:ui": "vitest --project=ui", "storybook": "storybook dev -p 6006", "build-storybook": "storybook build", - "cleanup": "rm -rf .svelte-kit build node_modules test-results" + "cleanup": "rm -rf .svelte-kit build node_modules test-results dist dev-dist debug-storybook.log static/pwa-*.png static/maskable-icon-*.png static/apple-touch-icon-*.png static/apple-splash-*.png static/favicon*.ico" }, "devDependencies": { "@chromatic-com/storybook": "5.0.0", @@ -41,29 +45,33 @@ "@tailwindcss/forms": "0.5.10", "@tailwindcss/typography": "0.5.16", "@tailwindcss/vite": "4.1.11", - "@types/node": "^24", + "@types/node": "24.13.0", + "@vite-pwa/assets-generator": "1.0.2", + "@vite-pwa/sveltekit": "1.1.0", "@vitest/browser": "4.1.8", "@vitest/browser-playwright": "4.1.8", "@vitest/coverage-v8": "4.1.8", "bits-ui": "2.18.1", "clsx": "2.1.1", - "dexie": "4.0.11", - "eslint": "9.39.2", + "dexie": "4.4.3", + "dompurify": "3.4.11", + "eslint": "9.39.4", "eslint-config-prettier": "10.1.8", - "eslint-plugin-storybook": "10.2.4", - "eslint-plugin-svelte": "3.15.0", - "globals": "16.3.0", + "eslint-plugin-storybook": "10.4.2", + "eslint-plugin-svelte": "3.19.0", + "fflate": "0.8.3", + "globals": "16.5.0", "highlight.js": "11.11.1", "http-server": "14.1.1", "mdast": "3.0.0", - "mdsvex": "0.12.6", + "mdsvex": "0.12.7", "mermaid": "11.15.0", "mode-watcher": "1.1.0", "pdfjs-dist": "5.4.54", "playwright": "1.56.1", - "prettier": "3.6.2", - "prettier-plugin-svelte": "3.4.0", - "prettier-plugin-tailwindcss": "0.6.14", + "prettier": "3.8.3", + "prettier-plugin-svelte": "4.1.0", + "prettier-plugin-tailwindcss": "0.8.0", "rehype-highlight": "7.0.2", "rehype-katex": "7.0.1", "rehype-stringify": "10.0.1", @@ -73,25 +81,25 @@ "remark-html": "16.0.1", "remark-math": "6.0.0", "remark-rehype": "11.1.2", - "sass": "1.93.3", - "storybook": "10.3.3", - "svelte": "5.55.7", - "svelte-check": "4.3.0", - "svelte-sonner": "1.0.5", - "tailwind-merge": "3.3.1", + "sass": "1.100.0", + "storybook": "10.4.2", + "svelte": "5.56.1", + "svelte-check": "4.6.0", + "svelte-sonner": "1.1.1", + "tailwind-merge": "3.6.0", "tailwind-variants": "3.2.2", - "tailwindcss": "4.1.11", - "tw-animate-css": "1.3.5", - "typescript": "5.8.3", - "typescript-eslint": "8.56.0", + "tailwindcss": "4.3.0", + "tw-animate-css": "1.4.0", + "typescript": "5.9.3", + "typescript-eslint": "8.60.1", "unified": "11.0.5", - "unist-util-visit": "5.0.0", + "unist-util-visit": "5.1.0", "uuid": "13.0.2", - "vite": "7.3.2", + "vite": "7.3.5", "vite-plugin-devtools-json": "0.2.1", "vitest": "4.1.8", "vitest-browser-svelte": "2.1.1", - "zod": "4.2.1" + "workbox-window": "7.4.1" }, "overrides": { "cookie": "1.1.1" diff --git a/tools/ui/playwright.config.ts b/tools/ui/playwright.config.ts index 460379238591..55bf38514047 100644 --- a/tools/ui/playwright.config.ts +++ b/tools/ui/playwright.config.ts @@ -1,11 +1,31 @@ -import { defineConfig } from '@playwright/test'; +import { defineConfig, devices } from '@playwright/test'; export default defineConfig({ + testDir: 'tests/e2e', + testMatch: ['**/*.e2e.ts'], + timeout: 30000, + expect: { + timeout: 5000 + }, + fullyParallel: true, + forbidOnly: !!process.env.CI, + retries: process.env.CI ? 2 : 0, + workers: process.env.CI ? 1 : undefined, + reporter: 'line', + use: { + baseURL: 'http://localhost:8181', + trace: 'on-first-retry' + }, + projects: [ + { + name: 'chromium', + use: { ...devices['Desktop Chrome'] } + } + ], webServer: { command: 'npm run build && npx http-server ./dist -p 8181', port: 8181, timeout: 120000, - reuseExistingServer: false - }, - testDir: 'tests/e2e' + reuseExistingServer: !process.env.CI + } }); diff --git a/tools/ui/pwa-assets-dark.config.ts b/tools/ui/pwa-assets-dark.config.ts new file mode 100644 index 000000000000..358c0ebc074e --- /dev/null +++ b/tools/ui/pwa-assets-dark.config.ts @@ -0,0 +1,27 @@ +import { defineConfig } from '@vite-pwa/assets-generator/config'; +import { FAVICON_COLORS, PWA_ASSET_GENERATOR } from './src/lib/constants/pwa'; +import { writeThemeFavicons } from './scripts/favicon-colorize'; + +writeThemeFavicons(FAVICON_COLORS.LIGHT, FAVICON_COLORS.DARK, { + padding: PWA_ASSET_GENERATOR.FAVICON_PADDING +}); + +export default defineConfig({ + headLinkOptions: { + preset: '2023' + }, + preset: { + transparent: { + sizes: [], + favicons: [[48, 'favicon-dark.ico']], + padding: PWA_ASSET_GENERATOR.FAVICON_PADDING + }, + maskable: { + sizes: [] + }, + apple: { + sizes: [] + } + }, + images: ['static/favicon-dark.svg'] +}); diff --git a/tools/ui/pwa-assets.config.ts b/tools/ui/pwa-assets.config.ts new file mode 100644 index 000000000000..b69884d94a9e --- /dev/null +++ b/tools/ui/pwa-assets.config.ts @@ -0,0 +1,68 @@ +import { + combinePresetAndAppleSplashScreens, + defineConfig, + minimal2023Preset +} from '@vite-pwa/assets-generator/config'; +import { readFileSync } from 'node:fs'; +import { resolve } from 'node:path'; +import { + THEME_COLORS, + PWA_GENERATOR_DEVICES, + PWA_ASSET_GENERATOR, + FAVICON_COLORS +} from './src/lib/constants/pwa'; +import { SplashOrientation } from './src/lib/enums/splash.enums'; +import { writeThemeFavicons } from './scripts/favicon-colorize'; + +writeThemeFavicons(FAVICON_COLORS.LIGHT, FAVICON_COLORS.DARK, { + padding: PWA_ASSET_GENERATOR.FAVICON_PADDING +}); + +export default defineConfig({ + headLinkOptions: { + preset: PWA_ASSET_GENERATOR.LINK_PRESET + }, + preset: combinePresetAndAppleSplashScreens( + { + ...minimal2023Preset, + // tiny margin so favicon.ico / pwa-*.png breathe inside the canvas + transparent: { + ...minimal2023Preset.transparent, + padding: PWA_ASSET_GENERATOR.FAVICON_PADDING + } + }, + { + padding: PWA_ASSET_GENERATOR.SPLASH_PADDING, + resizeOptions: { + background: THEME_COLORS.BACKGROUND_LIGHT, + fit: PWA_ASSET_GENERATOR.FIT_MODE + }, + darkResizeOptions: { + background: THEME_COLORS.BACKGROUND_DARK, + fit: PWA_ASSET_GENERATOR.FIT_MODE + }, + darkImageResolver: async (imageName: string) => { + if (imageName.endsWith('favicon.svg')) { + return readFileSync(resolve('static/favicon-dark.svg')); + } + }, + linkMediaOptions: { + log: true, + addMediaScreen: PWA_ASSET_GENERATOR.ADD_MEDIA_SCREEN, + basePath: PWA_ASSET_GENERATOR.BASE_PATH, + xhtml: PWA_ASSET_GENERATOR.XHTML + }, + png: { + compressionLevel: PWA_ASSET_GENERATOR.PNG_COMPRESSION_LEVEL, + quality: PWA_ASSET_GENERATOR.PNG_QUALITY + }, + name: (landscape, size, dark) => { + const orientation = landscape ? SplashOrientation.LANDSCAPE : SplashOrientation.PORTRAIT; + const darkPrefix = dark ? PWA_ASSET_GENERATOR.DARK_PREFIX : ''; + return `apple-splash-${orientation}-${darkPrefix}${size.width}x${size.height}.png`; + } + }, + PWA_GENERATOR_DEVICES + ), + images: ['static/favicon.svg'] +}); diff --git a/tools/ui/scripts/favicon-colorize.ts b/tools/ui/scripts/favicon-colorize.ts new file mode 100644 index 000000000000..e1872b7774f7 --- /dev/null +++ b/tools/ui/scripts/favicon-colorize.ts @@ -0,0 +1,107 @@ +import { mkdirSync, readFileSync, writeFileSync } from 'node:fs'; +import { dirname, resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const HERE = dirname(fileURLToPath(import.meta.url)); +const PROJECT_ROOT = resolve(HERE, '..'); + +const DEFAULT_LOGO = resolve(PROJECT_ROOT, 'src/lib/assets/logo.svg'); +const DEFAULT_OUT_DIR = resolve(PROJECT_ROOT, 'static'); +const DEFAULT_OUT_LIGHT = resolve(DEFAULT_OUT_DIR, 'favicon.svg'); +const DEFAULT_OUT_DARK = resolve(DEFAULT_OUT_DIR, 'favicon-dark.svg'); + +const CURRENT_COLOR = 'currentColor'; + +export interface ColorizedFavicon { + light: string; + dark: string; +} + +export interface WriteThemeFaviconsOptions { + sourcePath?: string; + lightOutPath?: string; + darkOutPath?: string; + /** + * Fraction of the icon (0..1) to leave as an even margin on each side. + * Applied by wrapping the inner content in a `` so the + * source `src/lib/assets/logo.svg` is not modified. Pass 0 to disable. + */ + padding?: number; +} + +/** + * Replace every `currentColor` occurrence in the SVG with the given color. + * Pure: no filesystem access, so it is straightforward to unit-test. + */ +export function colorizeFaviconSvg( + svg: string, + lightColor: string, + darkColor: string +): ColorizedFavicon { + return { + light: svg.replaceAll(CURRENT_COLOR, lightColor), + dark: svg.replaceAll(CURRENT_COLOR, darkColor) + }; +} + +/** + * Shrink the inner SVG content uniformly and re-center it so `padding` (a + * 0..1 fraction) is reserved as equal margin on each side. Returns the input + * unchanged for non-positive padding, missing/invalid `viewBox`, or unexpected + * markup so the caller always gets a renderable SVG. + */ +export function padFaviconSvg(svg: string, padding: number): string { + if (!(padding > 0) || padding >= 1) return svg; + + const viewBoxMatch = svg.match(/viewBox\s*=\s*["']([^"']+)["']/i); + if (!viewBoxMatch) return svg; + + const parts = viewBoxMatch[1] + .trim() + .split(/[\s,]+/) + .map(Number); + if (parts.length !== 4 || parts.some((n) => !Number.isFinite(n))) return svg; + + const [, , width, height] = parts; + if (width <= 0 || height <= 0) return svg; + + const scale = 1 - padding; + const translateX = (padding * width) / 2; + const translateY = (padding * height) / 2; + + const openTagStart = svg.search(/', openTagStart); + if (openTagEnd === -1) return svg; + const closeStart = svg.lastIndexOf('`; + return `${openTag}${group}${inner}${closeTag}`; +} + +/** + * Read `src/lib/assets/logo.svg`, colorize it for both themes, and write + * the results to the static directory so the PWA asset generator can consume + * them. Paths can be overridden for tests. + */ +export function writeThemeFavicons( + lightColor: string, + darkColor: string, + { + sourcePath = DEFAULT_LOGO, + lightOutPath = DEFAULT_OUT_LIGHT, + darkOutPath = DEFAULT_OUT_DARK, + padding = 0 + }: WriteThemeFaviconsOptions = {} +): void { + const source = readFileSync(sourcePath, 'utf-8'); + const { light, dark } = colorizeFaviconSvg(source, lightColor, darkColor); + mkdirSync(dirname(lightOutPath), { recursive: true }); + writeFileSync(lightOutPath, padFaviconSvg(light, padding)); + writeFileSync(darkOutPath, padFaviconSvg(dark, padding)); +} diff --git a/tools/ui/scripts/git-hooks/pre-commit.sh b/tools/ui/scripts/git-hooks/pre-commit.sh index 1fa83efde596..208ef56fcd53 100755 --- a/tools/ui/scripts/git-hooks/pre-commit.sh +++ b/tools/ui/scripts/git-hooks/pre-commit.sh @@ -27,7 +27,7 @@ echo "Running pre-commit checks for llama-ui..." # Format only staged files staged_ui=$(git diff --cached --name-only -- tools/ui/) if [ -n "$staged_ui" ]; then - echo "$staged_ui" | xargs npx --no-install prettier --write + echo "$staged_ui" | xargs npm run format format_ok=$? # Re-stage formatted files git add tools/ui/ diff --git a/tools/ui/scripts/git-hooks/pre-push.sh b/tools/ui/scripts/git-hooks/pre-push.sh index 953d3a224304..66d79b950a36 100755 --- a/tools/ui/scripts/git-hooks/pre-push.sh +++ b/tools/ui/scripts/git-hooks/pre-push.sh @@ -57,6 +57,7 @@ if [ $lint_ok -ne 0 ]; then echo "❌ Lint failed" exit 1 fi + if [ $test_ok -ne 0 ]; then echo "❌ Tests failed" exit 1 diff --git a/tools/ui/scripts/make-icons-circular.js b/tools/ui/scripts/make-icons-circular.js new file mode 100644 index 000000000000..7dfd6521e576 --- /dev/null +++ b/tools/ui/scripts/make-icons-circular.js @@ -0,0 +1,137 @@ +#!/usr/bin/env node + +/** + * Apply circular mask to pwa-*.png icons. + * Uses the maskable icon as source (white bg, full logo) to avoid + * the small-colormap pwa icons looking bad when cropped to a circle. + * + * Usage: node scripts/make-icons-circular.js [--padding-pct <0-50>] [--scale-pct <50-100>] + * + * - padding-pct: percentage of icon size kept as padding around the circle (default: 25) + * - scale-pct: scale down the source image before cropping (default: 85) + * + * maskable-icon and apple-touch-icon are left untouched. + */ + +import sharp from 'sharp'; +import fs from 'fs'; +import path from 'path'; +import { fileURLToPath } from 'url'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +const STATIC_DIR = path.resolve(__dirname, '..', 'static'); + +const paddingPct = process.argv.reduce((acc, arg, i, args) => { + if (arg === '--padding-pct' && args[i + 1]) return parseFloat(args[i + 1]); + return acc; +}, 0); + +// Scale down the source image before cropping to circle +const scalePct = process.argv.reduce((acc, arg, i, args) => { + if (arg === '--scale-pct' && args[i + 1]) return parseFloat(args[i + 1]); + return acc; +}, 85); // default 85% - icon fills 85% of the circular area + +// Source for circular icons: the maskable icon (white bg, full logo) +const sourceIcon = 'maskable-icon-512x512.png'; +const targetIcons = ['pwa-64x64.png', 'pwa-192x192.png', 'pwa-512x512.png']; + +// maskable-icon and apple-touch-icon stay square +const untouchedIcons = ['maskable-icon-512x512.png', 'apple-touch-icon-180x180.png']; + +async function makeCircle(targetFilename) { + const targetPath = path.join(STATIC_DIR, targetFilename); + const sourcePath = path.join(STATIC_DIR, sourceIcon); + + if (!fs.existsSync(sourcePath)) { + console.log(`⏭️ ${sourceIcon} not found, skipping`); + return; + } + if (!fs.existsSync(targetPath)) { + console.log(`⏭️ ${targetFilename} not found, skipping`); + return; + } + + const metadata = await sharp(targetPath).metadata(); + const size = Math.max(metadata.width, metadata.height); + const radius = Math.floor((size * (1 - paddingPct / 100)) / 2); + const center = Math.floor(size / 2); + + // Build circular mask as RGBA buffer: white opaque circle on transparent bg + const maskBuf = Buffer.alloc(size * size * 4, 0); + for (let y = 0; y < size; y++) { + for (let x = 0; x < size; x++) { + const dx = x - center; + const dy = y - center; + const dist = Math.sqrt(dx * dx + dy * dy); + if (dist < radius) { + const i = (y * size + x) * 4; + maskBuf[i] = 255; + maskBuf[i + 1] = 255; + maskBuf[i + 2] = 255; + maskBuf[i + 3] = 255; + } + } + } + + const tmpMask = path.join(STATIC_DIR, '.mask-tmp.png'); + await sharp(maskBuf, { + raw: { width: size, height: size, channels: 4 } + }) + .png() + .toFile(tmpMask); + + // Step 1: Scale source relative to circle diameter (not full icon), composite centered onto white canvas of full size + const circleDiameter = Math.floor(size * (1 - paddingPct / 100)); + const scaledSize = Math.floor((circleDiameter * scalePct) / 100); + const offset = Math.floor((size - scaledSize) / 2); + + const scaledBuf = await sharp(sourcePath) + .resize(scaledSize, scaledSize, { + fit: 'cover', + background: { r: 255, g: 255, b: 255, alpha: 1 } + }) + .ensureAlpha() + .png() + .toBuffer(); + + // Step 2: Composite scaled image onto white background, then apply circular mask + const output = await sharp({ + create: { + width: size, + height: size, + channels: 4, + background: { r: 255, g: 255, b: 255, alpha: 1 } + } + }) + .composite([ + { input: scaledBuf, top: offset, left: offset }, + { input: tmpMask, top: 0, left: 0, blend: 'dest-in' } + ]) + .png() + .toBuffer(); + + fs.writeFileSync(targetPath, output); + fs.unlinkSync(tmpMask); + + console.log( + `✓ ${targetFilename} → circle from ${sourceIcon}, ${paddingPct}% padding (size=${size}, r=${radius}, scale=${scalePct}%, circleDiameter=${circleDiameter})` + ); +} + +async function main() { + console.log(`Circular mask: ${paddingPct}% padding, ${scalePct}% scale, source=${sourceIcon}\n`); + for (const icon of targetIcons) { + await makeCircle(icon); + } + + console.log('\nUnchanged:'); + for (const icon of untouchedIcons) { + const fp = path.join(STATIC_DIR, icon); + console.log(` ${icon} (${fs.existsSync(fp) ? fs.statSync(fp).size + ' bytes' : 'missing'})`); + } +} + +main(); diff --git a/tools/ui/scripts/vite-plugin-build-info.ts b/tools/ui/scripts/vite-plugin-build-info.ts new file mode 100644 index 000000000000..972ba3b664c8 --- /dev/null +++ b/tools/ui/scripts/vite-plugin-build-info.ts @@ -0,0 +1,41 @@ +import { writeFileSync, existsSync } from 'node:fs'; +import { resolve } from 'path'; +import type { Plugin } from 'vite'; +import { BUILD_CONFIG } from '../src/lib/constants/pwa'; + +let processed = false; + +const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? BUILD_CONFIG.OUTPUT_DIR; + +/** + * Write build.json with the llama.cpp release build number. + * + * LLAMA_BUILD_NUMBER is passed from CMake -> npm -> vite via env var. + * Used for display of the current llama-server release (e.g. "b1234"). + */ +export function buildInfoPlugin(): Plugin { + return { + name: 'llamacpp:build-info', + apply: 'build', + closeBundle() { + setTimeout(() => { + try { + if (processed) return; + processed = true; + + const buildNumber = process.env.LLAMA_BUILD_NUMBER || 'b0000'; + + const outDir = resolve(OUTPUT_DIR); + const indexPath = resolve(outDir, 'index.html'); + if (!existsSync(indexPath)) return; + + const buildJsonPath = resolve(outDir, 'build.json'); + writeFileSync(buildJsonPath, JSON.stringify({ version: buildNumber }), 'utf-8'); + console.log(`Created build.json (version: ${buildNumber})`); + } catch (error) { + console.error('Failed to write build.json:', error); + } + }, 100); + } + }; +} diff --git a/tools/ui/scripts/vite-plugin-llama-cpp-build.ts b/tools/ui/scripts/vite-plugin-llama-cpp-build.ts deleted file mode 100644 index 74e3de9baa9c..000000000000 --- a/tools/ui/scripts/vite-plugin-llama-cpp-build.ts +++ /dev/null @@ -1,105 +0,0 @@ -import { - readFileSync, - writeFileSync, - existsSync, - readdirSync, - copyFileSync, - rmSync, - unlinkSync -} from 'fs'; -import { resolve } from 'path'; -import type { Plugin } from 'vite'; - -const GUIDE_FOR_FRONTEND = ` - -`.trim(); - -const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? './dist'; - -export function llamaCppBuildPlugin(): Plugin { - return { - name: 'llamacpp:build', - apply: 'build', - closeBundle() { - setTimeout(() => { - try { - const outDir = resolve(OUTPUT_DIR); - const indexPath = resolve(outDir, 'index.html'); - if (!existsSync(indexPath)) return; - - let content = readFileSync(indexPath, 'utf-8'); - - // Inline favicon as base64 data URL - const faviconPath = resolve('static/favicon.svg'); - if (existsSync(faviconPath)) { - const faviconContent = readFileSync(faviconPath, 'utf-8'); - const faviconBase64 = Buffer.from(faviconContent).toString('base64'); - const faviconDataUrl = `data:image/svg+xml;base64,${faviconBase64}`; - content = content.replace(/href="[^"]*favicon\.svg"/g, `href="${faviconDataUrl}"`); - console.log('✓ Inlined favicon.svg as base64 data URL'); - } - - content = content.replace(/\r/g, ''); - content = GUIDE_FOR_FRONTEND + '\n' + content; - - // Keep the Vite hash as a query string so each build busts the browser cache - content = content.replace(/\/_app\/immutable\/bundle\.([^".]+)\.js/g, './bundle.js?$1'); - content = content.replace( - /\/_app\/immutable\/assets\/bundle\.([^".]+)\.css/g, - './bundle.css?$1' - ); - content = content.replace(/__sveltekit_[a-z0-9]+/g, '__sveltekit__'); - - writeFileSync(indexPath, content, 'utf-8'); - console.log('✓ Updated index.html'); - - // Copy bundle.*.js -> bundle.js at output root - const immutableDir = resolve(outDir, '_app/immutable'); - const bundleDir = resolve(outDir, '_app/immutable/assets'); - - if (existsSync(immutableDir)) { - const jsFiles = readdirSync(immutableDir).filter((f) => f.match(/^bundle\..+\.js$/)); - if (jsFiles.length > 0) { - copyFileSync(resolve(immutableDir, jsFiles[0]), resolve(outDir, 'bundle.js')); - // Normalize __sveltekit_ to __sveltekit__ in bundle.js - const bundleJsPath = resolve(outDir, 'bundle.js'); - let bundleJs = readFileSync(bundleJsPath, 'utf-8'); - bundleJs = bundleJs.replace(/__sveltekit_[a-z0-9]+/g, '__sveltekit__'); - writeFileSync(bundleJsPath, bundleJs, 'utf-8'); - console.log(`✓ Copied ${jsFiles[0]} -> bundle.js`); - } - } - - // Copy bundle.*.css -> bundle.css at output root - if (existsSync(bundleDir)) { - const cssFiles = readdirSync(bundleDir).filter((f) => f.match(/^bundle\..+\.css$/)); - if (cssFiles.length > 0) { - copyFileSync(resolve(bundleDir, cssFiles[0]), resolve(outDir, 'bundle.css')); - console.log(`✓ Copied ${cssFiles[0]} -> bundle.css`); - } - } - - // Cleanup: remove _app directory, favicon.svg, and legacy index.html.gz - const appDir = resolve(outDir, '_app'); - if (existsSync(appDir)) { - rmSync(appDir, { recursive: true, force: true }); - console.log('✓ Removed _app directory'); - } - - const faviconOut = resolve(outDir, 'favicon.svg'); - if (existsSync(faviconOut)) { - unlinkSync(faviconOut); - console.log('✓ Removed favicon.svg'); - } - } catch (error) { - console.error('Failed to process build output:', error); - } - }, 100); - } - }; -} diff --git a/tools/ui/scripts/vite-plugin-relativize-base.ts b/tools/ui/scripts/vite-plugin-relativize-base.ts new file mode 100644 index 000000000000..ce2f1b6e9fab --- /dev/null +++ b/tools/ui/scripts/vite-plugin-relativize-base.ts @@ -0,0 +1,61 @@ +import { readFileSync, writeFileSync, existsSync } from 'node:fs'; +import { resolve } from 'path'; +import type { Plugin } from 'vite'; +import { BUILD_CONFIG } from '../src/lib/constants/pwa'; + +let processed = false; + +const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? BUILD_CONFIG.OUTPUT_DIR; + +function rewrite(path: string, pairs: [string, string][]): void { + if (!existsSync(path)) { + return; + } + const text = readFileSync(path, 'utf-8'); + let out = text; + for (const [from, to] of pairs) { + out = out.split(from).join(to); + } + if (out !== text) { + writeFileSync(path, out, 'utf-8'); + } +} + +/** + * Relativize SvelteKit absolute base refs so the build is relocatable under any subpath. + * + * SvelteKit bakes root absolute /_app/ paths into the SPA fallback because paths.relative + * does not apply to a depth agnostic fallback page. Rewriting to ./_app/ lets a plain + * recursive copy of the output into /any/subdir/ resolve assets against the document URL. + * Runs after adapter-static writes index.html and the PWA plugin writes sw.js, deferred the + * same way as buildInfoPlugin so the emitted files exist. + */ +export function relativizeBasePlugin(): Plugin { + return { + name: 'llamacpp:relativize-base', + apply: 'build', + closeBundle() { + setTimeout(() => { + try { + if (processed) return; + processed = true; + + const outDir = resolve(OUTPUT_DIR); + + // index.html: modulepreload, stylesheet and bootstrap import reference "/_app/ + rewrite(resolve(outDir, 'index.html'), [['"/_app/', '"./_app/']]); + + // sw.js: the only absolute entries are the navigate fallback precache key and handler + rewrite(resolve(outDir, 'sw.js'), [ + ['{url:"/"', '{url:"./"'], + ['createHandlerBoundToURL("/"', 'createHandlerBoundToURL("./"'] + ]); + + console.log('Relativized base refs in index.html and sw.js'); + } catch (error) { + console.error('Failed to relativize base refs:', error); + } + }, 100); + } + }; +} diff --git a/tools/ui/scripts/vite-plugin-splash-screen.ts b/tools/ui/scripts/vite-plugin-splash-screen.ts new file mode 100644 index 000000000000..059ce4920bc5 --- /dev/null +++ b/tools/ui/scripts/vite-plugin-splash-screen.ts @@ -0,0 +1,115 @@ +import { readdirSync, readFileSync, writeFileSync, existsSync } from 'node:fs'; +import { resolve } from 'path'; +import type { Plugin } from 'vite'; +import { TAB, NEWLINE } from '../src/lib/constants/code'; +import { APPLE_DEVICES, BUILD_CONFIG, REGEX_PATTERNS, SPLASH_LINK } from '../src/lib/constants/pwa'; +import type { SplashDimensions } from '../src/lib/types'; +import { SplashOrientation } from '../src/lib/enums/splash.enums'; + +let processed = false; + +const OUTPUT_DIR = process.env.LLAMA_UI_OUT_DIR ?? BUILD_CONFIG.OUTPUT_DIR; + +/** + * Generate iOS splash screen tags from generated apple-splash-*.png files. + * Returns an array of HTML link strings to be injected into the page head. + */ +export function generateSplashScreenLinks(outDir: string): string[] { + const files = readdirSync(outDir).filter((f) => f.match(REGEX_PATTERNS.SPLASH_FILE)); + if (files.length === 0) return []; + + const dimMap = new Map(); + for (const [dims, spec] of Object.entries(APPLE_DEVICES)) { + const [w, h] = dims.split('x').map(Number); + // logical-point dimensions + dimMap.set(`${w}x${h}`, { deviceW: spec.width, deviceH: spec.height, dpr: spec.dpr }); + dimMap.set(`${h}x${w}`, { deviceW: spec.width, deviceH: spec.height, dpr: spec.dpr }); + // pixel dimensions (used by actual generated splash files) + dimMap.set(`${w * spec.dpr}x${h * spec.dpr}`, { + deviceW: spec.width, + deviceH: spec.height, + dpr: spec.dpr + }); + dimMap.set(`${h * spec.dpr}x${w * spec.dpr}`, { + deviceW: spec.width, + deviceH: spec.height, + dpr: spec.dpr + }); + } + + const lightLinks: string[] = []; + const darkLinks: string[] = []; + + for (const file of files) { + const match = file.match(REGEX_PATTERNS.SPLASH_FILE); + if (!match) continue; + const orientation = match[1] as SplashOrientation; + const isDark = !!match[2]; + const pixelW = parseInt(match[3]); + const pixelH = parseInt(match[4]); + + const key = `${pixelW}x${pixelH}`; + const spec = dimMap.get(key); + if (!spec) { + console.warn(`Unknown splash screen dimensions: ${key} (${file})`); + continue; + } + + const { deviceW, deviceH, dpr } = spec; + const media = `screen and (device-width: ${deviceW}px) and (device-height: ${deviceH}px) and (-webkit-device-pixel-ratio: ${dpr}) and (orientation: ${orientation})`; + const href = `./${file}`; + + if (isDark) { + darkLinks.push( + `${SPLASH_LINK.HTML} media="${media}${SPLASH_LINK.DARK_MEDIA_SUFFIX}" href="${href}">` + ); + } else { + lightLinks.push(`${SPLASH_LINK.HTML} media="${media}" href="${href}">`); + } + } + + return [...lightLinks, ...darkLinks]; +} + +export function splashScreenPlugin(): Plugin { + return { + name: 'llamacpp:splash-screen', + apply: 'build', + closeBundle() { + setTimeout(() => { + try { + if (processed) return; + processed = true; + + const outDir = resolve(OUTPUT_DIR); + const indexPath = resolve(outDir, 'index.html'); + if (!existsSync(indexPath)) return; + + let content = readFileSync(indexPath, 'utf-8'); + + // Inject iOS splash screen tags into . + // The @vite-pwa/assets-generator generates apple-splash-*.png files; + // this scans them and creates the tags SvelteKit needs. + const splashLinks = generateSplashScreenLinks(outDir); + if (splashLinks.length > 0) { + console.log(`Generated ${splashLinks.length} apple-splash link tags`); + const splashHtml = splashLinks.map((l) => TAB + TAB + l).join(NEWLINE); + content = content.replace( + REGEX_PATTERNS.HEAD_CLOSE, + splashHtml + NEWLINE + TAB + TAB + '' + ); + } + + // Remove trailing \r from Windows line endings + content = content.replace(/\r/g, ''); + content = BUILD_CONFIG.GUIDE_COMMENT + NEWLINE + content; + + writeFileSync(indexPath, content, 'utf-8'); + console.log('Updated index.html'); + } catch (error) { + console.error('Failed to process build output:', error); + } + }, 100); + } + }; +} diff --git a/tools/ui/src/app.css b/tools/ui/src/app.css index 9254a96df740..8c4056477dbf 100644 --- a/tools/ui/src/app.css +++ b/tools/ui/src/app.css @@ -48,6 +48,7 @@ --chat-form-area-height: 8rem; --chat-form-area-offset: 2rem; + --chat-form-padding-top: 6rem; --max-message-height: max(24rem, min(80dvh, calc(100dvh - var(--chat-form-area-height) - 12rem))); } @@ -55,6 +56,7 @@ :root { --chat-form-area-height: 24rem; --chat-form-area-offset: 12rem; + --chat-form-padding-top: 6rem; } } @@ -141,7 +143,6 @@ @apply bg-background text-foreground; scrollbar-width: thin; scrollbar-gutter: stable; - overflow: hidden; /* Added due to Mermaid rendering somehow causing the double scrollbar */ } /* Global scrollbar styling - visible only on hover */ @@ -193,3 +194,7 @@ scrollbar-width: none; } } + +.mermaidTooltip { + display: none !important; +} diff --git a/tools/ui/src/app.d.ts b/tools/ui/src/app.d.ts index ec65952e9a7d..5264e5cc4d72 100644 --- a/tools/ui/src/app.d.ts +++ b/tools/ui/src/app.d.ts @@ -1,6 +1,9 @@ // See https://svelte.dev/docs/kit/types#app.d.ts // for information about these interfaces +import 'vite-plugin-pwa/pwa-assets'; +import 'vite-plugin-pwa/svelte'; + // Import chat types from dedicated module import type { @@ -16,6 +19,10 @@ import type { ApiErrorResponse, ApiLlamaCppServerProps, ApiModelDataEntry, + ApiModelLoadStage, + ApiModelsSseProgress, + ApiModelsSseData, + ApiModelsSseEvent, ApiModelListResponse, ApiProcessingState, ApiRouterModelMeta, @@ -49,6 +56,7 @@ import type { // Model types ModelModalities, ModelOption, + ModelLoadProgress, // Settings types SettingsChatServiceOptions, SettingsConfigValue, @@ -80,6 +88,10 @@ declare global { ApiErrorResponse, ApiLlamaCppServerProps, ApiModelDataEntry, + ApiModelLoadStage, + ApiModelsSseProgress, + ApiModelsSseData, + ApiModelsSseEvent, ApiModelListResponse, ApiProcessingState, ApiRouterModelMeta, @@ -117,6 +129,7 @@ declare global { // Model types ModelModalities, ModelOption, + ModelLoadProgress, // Settings types SettingsChatServiceOptions, SettingsConfigValue, diff --git a/tools/ui/src/app.html b/tools/ui/src/app.html index 1391f884880d..e1de226dcb81 100644 --- a/tools/ui/src/app.html +++ b/tools/ui/src/app.html @@ -2,10 +2,20 @@ - - + + + + + + + + %sveltekit.head% +
%sveltekit.body%
diff --git a/tools/ui/src/lib/actions/fade-in-view.svelte.ts b/tools/ui/src/lib/actions/fade-in-view.svelte.ts index d930448050d5..9a5918131aa9 100644 --- a/tools/ui/src/lib/actions/fade-in-view.svelte.ts +++ b/tools/ui/src/lib/actions/fade-in-view.svelte.ts @@ -10,9 +10,9 @@ import { isElementInViewport } from '$lib/utils/viewport'; */ export function fadeInView( node: HTMLElement, - options: { duration?: number; y?: number; skipIfVisible?: boolean } = {} + options: { duration?: number; y?: number; delay?: number; skipIfVisible?: boolean } = {} ) { - const { duration = 300, y = 0, skipIfVisible = false } = options; + const { duration = 300, y = 0, delay = 0, skipIfVisible = false } = options; if (skipIfVisible && isElementInViewport(node)) { return; @@ -27,10 +27,12 @@ export function fadeInView( (entries) => { for (const entry of entries) { if (entry.isIntersecting) { - requestAnimationFrame(() => { - node.style.opacity = '1'; - node.style.transform = 'translateY(0)'; - }); + setTimeout(() => { + requestAnimationFrame(() => { + node.style.opacity = '1'; + node.style.transform = 'translateY(0)'; + }); + }, delay); observer.disconnect(); } } diff --git a/tools/ui/src/lib/assets/logo.svg b/tools/ui/src/lib/assets/logo.svg new file mode 100644 index 000000000000..05424790afea --- /dev/null +++ b/tools/ui/src/lib/assets/logo.svg @@ -0,0 +1,7 @@ + + + + + + + diff --git a/tools/ui/src/lib/components/app/actions/ActionIcon.svelte b/tools/ui/src/lib/components/app/actions/ActionIcon.svelte index f156df66998a..8a86557bb98d 100644 --- a/tools/ui/src/lib/components/app/actions/ActionIcon.svelte +++ b/tools/ui/src/lib/components/app/actions/ActionIcon.svelte @@ -8,12 +8,13 @@ ariaLabel?: string; class?: string; disabled?: boolean; + href?: string; icon: Component; iconSize?: string; - onclick: (e?: MouseEvent) => void; + onclick?: (e?: MouseEvent) => void; size?: ButtonSize; stopPropagationOnClick?: boolean; - tooltip: string; + tooltip?: string; variant?: ButtonVariant; tooltipSide?: TooltipSide; } @@ -22,6 +23,7 @@ icon, tooltip, variant = 'ghost', + href = '', size = 'sm', class: className = '', disabled = false, @@ -31,34 +33,49 @@ onclick, ariaLabel }: Props = $props(); + + let innerWidth = $state(0); + const showTooltip = $derived(!!tooltip && innerWidth > 768); - - - - {#snippet child({ props })} - +{/snippet} + +{#if showTooltip} + + + + {#snippet child({ props })} + {@render button(props)} + {/snippet} + - onclick?.(e); - }} - class="h-6 w-6 p-0 {className} flex hover:bg-transparent data-[state=open]:bg-transparent!" - aria-label={ariaLabel || tooltip} - > - {#if icon} - {@const IconComponent = icon} - - {/if} - - {/snippet} - + +

{tooltip}

+
+
+{:else} + {@render button({ href })} +{/if} - -

{tooltip}

-
- + diff --git a/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpPrompt.svelte b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpPrompt.svelte index 636e93f22111..c55dfdec7b82 100644 --- a/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpPrompt.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatAttachments/ChatAttachmentsList/ChatAttachmentsListItem/ChatAttachmentsListItemMcpPrompt.svelte @@ -33,7 +33,7 @@ {#if !readonly && onRemove}
onRemove?.()} />
diff --git a/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte b/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte index ed26f9ea58e4..9b2077b8dcbd 100644 --- a/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte +++ b/tools/ui/src/lib/components/app/chat/ChatForm/ChatForm.svelte @@ -494,7 +494,7 @@ />
{#if hasMcpPromptsSupport} - diff --git a/tools/ui/src/lib/components/app/navigation/DesktopIconStrip.svelte b/tools/ui/src/lib/components/app/navigation/DesktopIconStrip.svelte deleted file mode 100644 index e92b9528a61b..000000000000 --- a/tools/ui/src/lib/components/app/navigation/DesktopIconStrip.svelte +++ /dev/null @@ -1,84 +0,0 @@ - - - - - - diff --git a/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte b/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte index 1fa7722f2406..fe503f53baa1 100644 --- a/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte +++ b/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigation.svelte @@ -1,40 +1,76 @@ - $effect(() => { - if (!sidebar.open) { - isSearchModeActive = false; - searchQuery = ''; - openedForSearch = false; - } - }); + + +{#if innerWidth > 768 || (!page.url.hash.includes(ROUTES.SETTINGS) && !page.url.hash.includes(ROUTES.MCP_SERVERS) && !page.url.hash.includes(ROUTES.SEARCH))} + +{/if} - async function selectConversation(id: string) { - if (isSearchModeActive) { - isSearchModeActive = false; - searchQuery = ''; + diff --git a/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationActions.svelte b/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationActions.svelte index f0d63970eea8..f118a68dfc67 100644 --- a/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationActions.svelte +++ b/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationActions.svelte @@ -1,39 +1,86 @@ @@ -41,56 +88,109 @@ {/snippet} -
- {#if isSearchModeActive} +{#if isSearchModeActive} +
e.key === 'Escape' && handleSearchModeDeactivate()} placeholder="Search conversations..." - {isCancelAlwaysVisible} /> - {:else} - {#each SIDEBAR_ACTIONS_ITEMS as item (item.route)} - {#if !item.route} - - {:else} - +
+{:else if isExpandedMode || isOnMobile} +
+ {#each SIDEBAR_ACTIONS_ITEMS as item, i (item.tooltip)} + {@const isActive = isItemActive(item)} + {@const isSearchOnMobile = item.icon === Search && isMobile.current} + {@const itemHref = isSearchOnMobile ? ROUTES.SEARCH : item.route} + {@const itemOnClick = item.route + ? () => { + onNewChat?.(); + goto(item.route!); + } + : isSearchOnMobile + ? undefined + : onSearchClick} + {@const itemTransition = { + duration: ICON_STRIP_TRANSITION_DURATION, + delay: !initialized + ? ICON_STRIP_TRANSITION_DELAY_MULTIPLIER + i * ICON_STRIP_TRANSITION_DELAY_MULTIPLIER + : 0, + easing: circIn + }} + + {#if showIcons} +
+ +
+ {/if} + {/each} +
+{:else} + +
+{/if} diff --git a/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationConversationItem.svelte b/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationConversationItem.svelte index b1c2b78f65ea..2c1b9adf21ba 100644 --- a/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationConversationItem.svelte +++ b/tools/ui/src/lib/components/app/navigation/SidebarNavigation/SidebarNavigationConversationItem.svelte @@ -39,7 +39,6 @@ depth = 0 }: Props = $props(); - let renderActionsDropdown = $state(false); let dropdownOpen = $state(false); let isLoading = $derived(getAllLoadingChats().includes(conversation.id)); @@ -71,26 +70,10 @@ } } - function handleMouseLeave() { - if (!dropdownOpen) { - renderActionsDropdown = false; - } - } - - function handleMouseOver() { - renderActionsDropdown = true; - } - function handleSelect() { onSelect?.(conversation.id); } - $effect(() => { - if (!dropdownOpen) { - renderActionsDropdown = false; - } - }); - onMount(() => { document.addEventListener('edit-active-conversation', handleGlobalEditEvent as EventListener); @@ -103,23 +86,19 @@ }); - -
{#if depth > 0} @@ -130,7 +109,7 @@ @@ -146,18 +125,15 @@ {#if isLoading} -
e.key === 'Enter' && handleStop(e)} - role="button" - tabindex="0" aria-label="Stop generation" >
+
@@ -169,52 +145,50 @@
- {#if renderActionsDropdown} -
- { - e.stopPropagation(); - handleTogglePin(); - } - }, - { - icon: Pencil, - label: 'Edit', - onclick: handleEdit, - shortcut: ['shift', 'cmd', 'e'] - }, - { - icon: Download, - label: 'Export', - onclick: (e: Event) => { - e.stopPropagation(); - conversationsStore.downloadConversation(conversation.id); - }, - shortcut: ['shift', 'cmd', 's'] - }, - { - icon: Trash2, - label: 'Delete', - onclick: handleDelete, - variant: 'destructive', - shortcut: ['shift', 'cmd', 'd'], - separator: true +
+ { + e.stopPropagation(); + handleTogglePin(); } - ]} - /> -
- {/if} - + }, + { + icon: Pencil, + label: 'Edit', + onclick: handleEdit, + shortcut: ['shift', 'cmd', 'e'] + }, + { + icon: Download, + label: 'Export', + onclick: (e: Event) => { + e.stopPropagation(); + conversationsStore.downloadConversation(conversation.id); + }, + shortcut: ['shift', 'cmd', 's'] + }, + { + icon: Trash2, + label: 'Delete', + onclick: handleDelete, + variant: 'destructive', + shortcut: ['shift', 'cmd', 'd'], + separator: true + } + ]} + /> +
+
${markup}` : ''; +} diff --git a/tools/ui/src/routes/(chat)/chat/[id]/+page.svelte b/tools/ui/src/routes/(chat)/chat/[id]/+page.svelte index e31d4443ef39..f14553c90a90 100644 --- a/tools/ui/src/routes/(chat)/chat/[id]/+page.svelte +++ b/tools/ui/src/routes/(chat)/chat/[id]/+page.svelte @@ -4,7 +4,7 @@ import { afterNavigate } from '$app/navigation'; import { DialogModelNotAvailable } from '$lib/components/app'; import { APP_NAME, ROUTES } from '$lib/constants'; - import { chatStore, isLoading } from '$lib/stores/chat.svelte'; + import { chatStore } from '$lib/stores/chat.svelte'; import { conversationsStore, activeConversation } from '$lib/stores/conversations.svelte'; import { modelsStore, modelOptions } from '$lib/stores/models.svelte'; @@ -83,7 +83,7 @@ // Skip loading if this conversation is already active (e.g., just created) if (activeConversation()?.id === chatId) { - // Still handle URL params even if conversation is active + void chatStore.discoverActiveStream(chatId); if ((qParam !== null || modelParam !== null) && !urlParamsProcessed) { handleUrlParams(); } @@ -92,35 +92,33 @@ (async () => { const success = await conversationsStore.loadConversation(chatId); - if (success) { - chatStore.syncLoadingStateForChat(chatId); - - // Handle URL params after conversation is loaded - if ((qParam !== null || modelParam !== null) && !urlParamsProcessed) { - await handleUrlParams(); - } - } else { + if (!success) { await goto(ROUTES.START); + return; + } + chatStore.syncLoadingStateForChat(chatId); + // server probe (with localStorage fallback) and attach + await chatStore.discoverActiveStream(chatId); + + if ((qParam !== null || modelParam !== null) && !urlParamsProcessed) { + await handleUrlParams(); } })(); } }); $effect(() => { - if (typeof window !== 'undefined') { - const handleBeforeUnload = () => { - if (isLoading()) { - console.log('Page unload detected while streaming - aborting stream'); - chatStore.stopGeneration(); - } - }; - - window.addEventListener('beforeunload', handleBeforeUnload); - - return () => { - window.removeEventListener('beforeunload', handleBeforeUnload); - }; - } + if (typeof window === 'undefined' || typeof document === 'undefined') return; + + // when the tab comes back to the foreground, re-run discovery to catch any race + // where the initial mount probe missed an active session + const onVisibility = () => { + if (document.visibilityState !== 'visible') return; + if (!chatId) return; + void chatStore.discoverActiveStream(chatId); + }; + document.addEventListener('visibilitychange', onVisibility); + return () => document.removeEventListener('visibilitychange', onVisibility); }); diff --git a/tools/ui/src/routes/+layout.svelte b/tools/ui/src/routes/+layout.svelte index aa023840bdeb..38848786e998 100644 --- a/tools/ui/src/routes/+layout.svelte +++ b/tools/ui/src/routes/+layout.svelte @@ -6,16 +6,13 @@ import { page } from '$app/state'; import { untrack } from 'svelte'; import { onMount } from 'svelte'; - import { fade } from 'svelte/transition'; - import { - DesktopIconStrip, - DialogConversationTitleUpdate, - SidebarNavigation - } from '$lib/components/app'; + import { SidebarNavigation, DialogConversationTitleUpdate } from '$lib/components/app'; + import { PwaMetaTags, PwaRefreshAlert } from '$lib/components/pwa'; + import { pwaAssetsHead } from 'virtual:pwa-assets/head'; + import { chatStore } from '$lib/stores/chat.svelte'; import { conversationsStore } from '$lib/stores/conversations.svelte'; - import * as Sidebar from '$lib/components/ui/sidebar/index.js'; import * as Tooltip from '$lib/components/ui/tooltip'; import { isRouterMode, serverStore } from '$lib/stores/server.svelte'; import { config, settingsStore } from '$lib/stores/settings.svelte'; @@ -26,16 +23,17 @@ import { modelsStore } from '$lib/stores/models.svelte'; import { mcpStore } from '$lib/stores/mcp.svelte'; import { TOOLTIP_DELAY_DURATION } from '$lib/constants'; + import { FAVICON_PATHS, FAVICON_SELECTORS } from '$lib/constants/pwa'; import { useKeyboardShortcuts } from '$lib/hooks/use-keyboard-shortcuts.svelte'; - import { useSettingsNavigation } from '$lib/hooks/use-settings-navigation.svelte'; + import { usePwa } from '$lib/hooks/use-pwa.svelte'; import { conversations } from '$lib/stores/conversations.svelte'; import { isMobile } from '$lib/stores/viewport.svelte'; + import { theme } from '$lib/stores/theme.svelte'; + import { buildInfoStore } from '$lib/stores/build-info.svelte'; + + import { SETTINGS_KEYS } from '$lib/constants'; let { children } = $props(); - let alwaysShowSidebarOnDesktop = $derived(config().alwaysShowSidebarOnDesktop); - let isDesktop = $derived(!isMobile.current); - let sidebarOpen = $state(false); - let mounted = $state(false); let innerHeight = $state(); let innerWidth = $state(browser ? window.innerWidth : 0); @@ -46,11 +44,30 @@ } | undefined = $state(); + let showBuildVersion = $derived(config()[SETTINGS_KEYS.SHOW_BUILD_VERSION] as boolean); + let titleUpdateDialogOpen = $state(false); let titleUpdateCurrentTitle = $state(''); let titleUpdateNewTitle = $state(''); let titleUpdateResolve: ((value: boolean) => void) | null = null; - const panelNav = useSettingsNavigation(); + + // Keep the hook object intact: destructuring needRefreshByStorage reads the getter once and freezes it + const pwa = usePwa(); + const { needRefresh, updateServiceWorker } = pwa; + + function updateFavicon() { + const dark = theme.isSystemDark; + + let icoLink = document.querySelector(FAVICON_SELECTORS.ICO_48X48) as HTMLLinkElement | null; + if (icoLink) { + icoLink.href = dark ? FAVICON_PATHS.ICO_DARK : FAVICON_PATHS.ICO_LIGHT; + } + + let svgLink = document.querySelector(FAVICON_SELECTORS.SVG_ANY) as HTMLLinkElement | null; + if (svgLink) { + svgLink.href = dark ? FAVICON_PATHS.SVG_DARK : FAVICON_PATHS.SVG_LIGHT; + } + } function navigateToConversation(direction: -1 | 1) { const allConvs = conversations(); @@ -137,15 +154,23 @@ } onMount(() => { - mounted = true; + updateFavicon(); + // snapshot of every backend running stream on first load, populates the sidebar spinners + // so the user sees each conv that has a live inference, even ones not opened yet + void chatStore.syncRemoteRunningStreams(); }); + // refresh that snapshot when the tab returns to the foreground, a stream may have advanced + // or ended while it was hidden. snapshot only, no polling + function handleVisibilityChange() { + if (document.visibilityState !== 'visible') return; + void chatStore.syncRemoteRunningStreams(); + } + $effect(() => { - if (alwaysShowSidebarOnDesktop && isDesktop) { - sidebarOpen = true; + void theme.isSystemDark; - return; - } + updateFavicon(); }); // Initialize server properties on app load (run once) @@ -195,6 +220,20 @@ } }); + // Live model status and load progress via the /models/sse feed (router mode) + $effect(() => { + if (!browser) return; + if (!isRouterMode()) return; + + untrack(() => { + modelsStore.subscribeStatus(); + }); + + return () => { + modelsStore.unsubscribeStatus(); + }; + }); + // Background MCP server health checks on app load // Fetch enabled servers from settings and run health checks in background $effect(() => { @@ -236,13 +275,43 @@ + {#if pwaAssetsHead.themeColor} + + {/if} + {#if config().customCss} {/if} + + {#each pwaAssetsHead.links as link (link.href)} + + {/each} + + + + + +
+ { + if (isMobile.current) { + goto(ROUTES.SEARCH); + } else if (chatSidebar?.activateSearchMode) { + chatSidebar.activateSearchMode(); + } + }} + /> + +
+ {@render children?.()} +
+
+ + - - -
- - - {#if !(alwaysShowSidebarOnDesktop && isDesktop) && !(panelNav.isSettingsRoute && !isDesktop)} - {#if mounted} -
- -
- {/if} - {/if} - - {#if isDesktop && !alwaysShowSidebarOnDesktop} - { - if (chatSidebar?.activateSearchMode) { - chatSidebar.activateSearchMode(); - } - - sidebarOpen = true; - }} - /> - {/if} - - {@render children?.()} -
-
- + +
+ {#if showBuildVersion && buildInfoStore.value} + {buildInfoStore.value} + {/if} + + +
diff --git a/tools/ui/src/routes/search/+page.svelte b/tools/ui/src/routes/search/+page.svelte new file mode 100644 index 000000000000..d99dc1f2fab3 --- /dev/null +++ b/tools/ui/src/routes/search/+page.svelte @@ -0,0 +1,95 @@ + + + + Search · llama.cpp + + +
+ +
+ +
+ +
diff --git a/tools/ui/src/routes/settings/+layout.svelte b/tools/ui/src/routes/settings/+layout.svelte index b3b3d30e0bd7..c2b587862f25 100644 --- a/tools/ui/src/routes/settings/+layout.svelte +++ b/tools/ui/src/routes/settings/+layout.svelte @@ -27,12 +27,10 @@ } -
-
- -
+
+ +
-
- {@render children?.()} -
+
+ {@render children?.()}
diff --git a/tools/ui/src/routes/settings/[[section]]/+page.svelte b/tools/ui/src/routes/settings/[[section]]/+page.svelte index 22e727f175b1..90fe6107012f 100644 --- a/tools/ui/src/routes/settings/[[section]]/+page.svelte +++ b/tools/ui/src/routes/settings/[[section]]/+page.svelte @@ -1,12 +1,11 @@ - - - + diff --git a/tools/ui/tests/e2e/demo.test.ts b/tools/ui/tests/e2e/demo.test.ts deleted file mode 100644 index 99a3e86da3d4..000000000000 --- a/tools/ui/tests/e2e/demo.test.ts +++ /dev/null @@ -1,7 +0,0 @@ -import { expect, test } from '@playwright/test'; - -test('home page loads correctly', async ({ page }) => { - await page.goto('/'); - // Wait for the greeting to become visible (stores need time to initialize) - await expect(page.locator('h1', { hasText: /Hello there/ })).toBeVisible(); -}); diff --git a/tools/ui/tests/e2e/pwa.e2e.ts b/tools/ui/tests/e2e/pwa.e2e.ts new file mode 100644 index 000000000000..e21672239b57 --- /dev/null +++ b/tools/ui/tests/e2e/pwa.e2e.ts @@ -0,0 +1,106 @@ +import { expect, test } from '@playwright/test'; + +test.describe('PWA Service Worker', () => { + test('service worker is registered', async ({ page }) => { + await page.goto('/'); + + const swURL = await page.evaluate(async () => { + const registration = await Promise.race([ + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore - type inference differs from browser runtime + navigator.serviceWorker.ready, + new Promise((_, reject) => + setTimeout(() => reject(new Error('Service worker registration failed: timeout')), 15000) + ) + ]); + // @ts-expect-error registration is of type unknown + return registration.active?.scriptURL; + }); + + expect(swURL).toBeTruthy(); + expect(swURL).toContain('/sw.js'); + }); + + test('service worker has precache configured', async ({ page }) => { + await page.goto('/'); + + await page.evaluate(async () => { + await navigator.serviceWorker.ready; + }); + + const swActive = await page.evaluate(async () => { + const reg = await navigator.serviceWorker.ready; + return reg.active?.scriptURL ?? null; + }); + + expect(swActive).toBeTruthy(); + + const swResponse = await page.request.get(swActive!); + const swContent = await swResponse.text(); + + // Precache contains SvelteKit content-hashed bundle paths + expect(swContent).toMatch(/"_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/); + expect(swContent).toMatch(/"_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/); + expect(swContent).toMatch(/"manifest\.webmanifest"/); + expect(swContent).toMatch(/"_app\/version\.json"/); + expect(swContent).toMatch(/NavigationRoute/); + expect(swContent).toMatch(/api-cache/); + }); + + test('offline mode - page loads when offline after caching', async ({ browser }) => { + const context = await browser.newContext(); + const offlinePage = await context.newPage(); + + await offlinePage.goto('/'); + await offlinePage.waitForLoadState('networkidle'); + + await offlinePage.evaluate(async () => { + await navigator.serviceWorker.ready; + }); + + await offlinePage.waitForTimeout(2000); + + await context.setOffline(true); + await offlinePage.goto('/'); + + const bodyText = await offlinePage.locator('body').textContent(); + expect(bodyText).toBeTruthy(); + + await context.close(); + }); + + test('version.json is accessible and contains version', async ({ page }) => { + const versionResponse = await page.request.get('/_app/version.json'); + expect(versionResponse.ok()).toBeTruthy(); + + const versionData = await versionResponse.json(); + expect(versionData).toHaveProperty('version'); + expect(typeof versionData.version).toBe('string'); + expect(versionData.version.length).toBeGreaterThan(0); + }); + + test('manifest.webmanifest is accessible and valid', async ({ page }) => { + const response = await page.request.get('/manifest.webmanifest'); + expect(response.ok()).toBeTruthy(); + + const manifest = await response.json(); + expect(manifest).toHaveProperty('name', 'llama-ui'); + expect(manifest).toHaveProperty('short_name', 'llama-ui'); + expect(manifest).toHaveProperty('start_url', './'); + expect(manifest).toHaveProperty('display', 'standalone'); + expect(manifest.icons).toBeTruthy(); + expect(manifest.icons.length).toBeGreaterThan(0); + }); + + test('index.html contains content-hashed bundle references', async ({ page }) => { + const response = await page.request.get('/'); + expect(response.ok()).toBeTruthy(); + + const html = await response.text(); + + // SvelteKit outputs content-hashed bundle names in _app/immutable/ + expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"/); + expect(html).toMatch(/href="(\.\/|\/)_app\/immutable\/assets\/bundle\.[a-zA-Z0-9_-]+\.css"/); + expect(html).toMatch(/import\("(\.\/|\/)_app\/immutable\/bundle\.[a-zA-Z0-9_-]+\.js"\)/); + }); +}); diff --git a/tools/ui/tests/stories/ModelsSelector.stories.svelte b/tools/ui/tests/stories/ModelsSelector.stories.svelte new file mode 100644 index 000000000000..3aba0dc78e45 --- /dev/null +++ b/tools/ui/tests/stories/ModelsSelector.stories.svelte @@ -0,0 +1,269 @@ + + + + + +
+ console.log('Info clicked:', modelName)} + /> +
+
+ + +
+ console.log('Info clicked:', modelName)} + /> +
+
+ + +
+ console.log('Info clicked:', modelName)} + /> +
+
+ + +
+
+ Server model states +
+ {}} + onMouseEnter={() => {}} + onKeyDown={() => {}} + /> + {}} + onMouseEnter={() => {}} + onKeyDown={() => {}} + /> + {}} + onMouseEnter={() => {}} + onKeyDown={() => {}} + /> + {}} + onMouseEnter={() => {}} + onKeyDown={() => {}} + /> + {}} + onMouseEnter={() => {}} + onKeyDown={() => {}} + /> +
+
+ + +
+
+ Selection states +
+ {}} + onMouseEnter={() => {}} + onKeyDown={() => {}} + /> + {}} + onMouseEnter={() => {}} + onKeyDown={() => {}} + /> + {}} + onMouseEnter={() => {}} + onKeyDown={() => {}} + /> + {}} + onMouseEnter={() => {}} + onKeyDown={() => {}} + /> +
+
diff --git a/tools/ui/tests/stories/PwaRefreshAlert.stories.svelte b/tools/ui/tests/stories/PwaRefreshAlert.stories.svelte new file mode 100644 index 000000000000..26aa57323264 --- /dev/null +++ b/tools/ui/tests/stories/PwaRefreshAlert.stories.svelte @@ -0,0 +1,57 @@ + + + console.log('reload') }} + play={async ({ canvas }) => { + const title = canvas.getByText('Update available'); + await expect(title).toBeInTheDocument(); + + const description = canvas.getByText(/A new version is available/); + await expect(description).toBeInTheDocument(); + + const button = canvas.getByRole('button', { name: 'Reload' }); + await expect(button).toBeInTheDocument(); + }} +/> + + console.log('reload') }} + play={async ({ canvas }) => { + const title = canvas.queryByText('Update available'); + await expect(title).not.toBeInTheDocument(); + }} +/> + + console.log('reload') + }} + play={async ({ canvas, userEvent }) => { + const button = canvas.getByRole('button', { name: 'Reload' }); + await expect(button).toBeInTheDocument(); + + await userEvent.click(button); + + const title = canvas.queryByText('Update available'); + await expect(title).not.toBeInTheDocument(); + + const reloadBtn = canvas.queryByRole('button', { name: 'Reload' }); + await expect(reloadBtn).not.toBeInTheDocument(); + }} +/> diff --git a/tools/ui/tests/stories/SidebarNavigation.stories.svelte b/tools/ui/tests/stories/SidebarNavigation.stories.svelte index aae42f2a053c..7fbcef3ed5bd 100644 --- a/tools/ui/tests/stories/SidebarNavigation.stories.svelte +++ b/tools/ui/tests/stories/SidebarNavigation.stories.svelte @@ -14,10 +14,6 @@