From 1b10b28cd6774f93d364de82f155c308746f65b0 Mon Sep 17 00:00:00 2001
From: oscar1229 <2252289184@qq.com>
Date: Fri, 3 Jul 2026 14:16:45 +0800
Subject: [PATCH] fix(spacemit): fix MiniCPM-V SMT multimodal inference on
 RISC-V

Bundles three independent changes needed to run MiniCPM-V via the SMT
media backend with multi-threaded warmup and multi-turn image
conversations: two bug fixes and one new vision preprocessing path.

* fix(spacemit): fix IME paired-lane GEMM threadpool deadlock

  The IME GEMM kernels (forward_mul_mat and the mul_mat_id MoE path)
  rendezvous thread pairs (2k, 2k+1) on a spine_barrier built for two
  participants, so both lanes must call spine_barrier_wait() the same
  number of times. Two cases broke that invariant: the old per-thread
  loop in forward_mul_mat could iterate a different number of times
  per lane when gemm_n was not a multiple of NB_COLS*nth, and with an
  odd nth the trailing even thread had no partner at all. In either
  case one lane waits on the barrier forever (seen as a warmup hang
  with -t 8). In forward_mul_mat, drive the loop from a pair-aligned
  base with a per-lane offset (both lanes always iterate equally; an
  out-of-range lane skips the GEMM but still hits the barrier). In
  both kernels, guard the barrier with has_pair so a partnerless
  thread never waits.

* server: force full re-prefill for multimodal FULL-only KV cache reuse

  MiniCPM-V runs on the qwen35 hybrid (SSM + periodic full-attention)
  backend whose KV memory only supports full sequence removal. On a
  multi-turn request, partial prompt-cache reuse would either restore a
  context checkpoint (resurrecting a KV state inconsistent with the
  external smt/ONNX vision embeddings) or call partial memory_seq_rm on
  FULL-only memory, which returns false and triggers GGML_ABORT. When
  the context is multimodal and the reused prefix is partial, force a
  full re-prefill (pos_next = 0, n_past = 0) before the checkpoint /
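  seq_rm path. Pure-append turns (n_past == n_tokens) are unaffected.
  For non-multimodal contexts the checkpoint check is now skipped
  when n_past == n_tokens (the bound changes from <= to <); they are
  otherwise unaffected.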

* feat(mtmd): add MiniCPM-V SMT vision preprocessing

  The MiniCPM-V SMT vision ONNX export does not normalize pixels
  internally. Detect minicpmv / minicpm_v / minicpm-v architectures and
  route them through rgb_u8_to_chw_f32_with_config, which reads
  rescale_factor / image_mean / image_std from config.json's
  vision_preprocess block and emits a CHW float32 tensor. Target
  defaults to 448x448, overridable via vision_model.input_width/height.
---
 ggml/src/ggml-cpu/spacemit/ime.cpp   | 71 ++++++++++++++++++----------
 tools/mtmd/smt-vision-preprocess.cpp |  7 +++
 tools/server/server-context.cpp      | 19 ++++++--
 3 files changed, 68 insertions(+), 29 deletions(-)
diff --git a/ggml/src/ggml-cpu/spacemit/ime.cpp b/ggml/src/ggml-cpu/spacemit/ime.cpp
index 50e137b50af9..d474549ee913 100644
--- a/ggml/src/ggml-cpu/spacemit/ime.cpp
+++ b/ggml/src/ggml-cpu/spacemit/ime.cpp
@@ -439,55 +439,70 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_
             }
             uint8_t * b_col_zp = block_type_has_zp<BLOC_TYPE>() ? b_col : nullptr;
 
-            int64_t ni      = ith * NB_COLS;
-            int64_t nb_real = std::min(gemm_n - ni, NB_COLS);
+            // Pair-barrier lockstep: an even thread with no odd partner (odd nth) must not wait.
+            const bool has_pair = ((ith & 1) != 0) || (ith + 1 < nth);
 
-            if (ith % 2 == 0 && nb_real > 0) {
-                spacemit_kernels::rvv::memcpy1d(b_col, reinterpret_cast<uint8_t *>(w_data) + ni * row_stride_b,
-                                                nb_real * row_stride_b);
+            const int64_t ni0     = (int64_t) ith * NB_COLS;
+            const int64_t nb0     = std::min(gemm_n - ni0, (int64_t) NB_COLS);
+            const bool    active0 = nb0 > 0;
+
+            if (ith % 2 == 0 && active0) {
+                spacemit_kernels::rvv::memcpy1d(b_col, reinterpret_cast<uint8_t *>(w_data) + ni0 * row_stride_b,
+                                                nb0 * row_stride_b);
                 if (a_row != quant_a_buffer) {
                     spacemit_kernels::rvv::memcpy1d(a_row, quant_a_buffer, gemm_workspace_size);
                 }
             }
 
-            spine_barrier_wait(cur_barrier);
+            if (has_pair) {
+                spine_barrier_wait(cur_barrier);
+            }
 
-            if (ith % 2 != 0 && nb_real > 0) {
+            if (ith % 2 != 0 && active0) {
                 if (a_row != quant_a_buffer) {
                     spacemit_kernels::rvv::memcpy1d(a_row, quant_a_buffer, gemm_workspace_size);
                 }
-                spacemit_kernels::rvv::memcpy1d(b_col, reinterpret_cast<uint8_t *>(w_data) + ni * row_stride_b,
-                                                nb_real * row_stride_b);
+                spacemit_kernels::rvv::memcpy1d(b_col, reinterpret_cast<uint8_t *>(w_data) + ni0 * row_stride_b,
+                                                nb0 * row_stride_b);
             }
 
-            for (; ni < gemm_n; ni += NB_COLS * nth) {
-                int64_t rows_remaining = gemm_m;
-                float * c_blk          = output + ni;
-                auto *  a_row_cur      = a_row;
+            // Iterate over the even lane's column base so both lanes of a pair run in lockstep.
+            const int64_t base_start = (int64_t) (ith & ~1) * NB_COLS;
+            const int64_t lane_off   = (int64_t) (ith & 1) * NB_COLS;
+            for (int64_t base = base_start; base < gemm_n; base += NB_COLS * nth) {
+                const int64_t ni     = base + lane_off;
+                const bool    active = ni < gemm_n;
+                const int64_t nb_real = active ? std::min(gemm_n - ni, (int64_t) NB_COLS) : 0;
 
-                if (ith % 2 != 0) {
+                if (has_pair && ith % 2 != 0) {
                     spine_barrier_wait(cur_barrier);
                 }
 
-                while (rows_remaining > 0) {
-                    auto rows_handled = gemm_kernel(b_blk_len, a_row_cur, b_col, b_col_zp, c_blk, rows_remaining,
-                                                    nb_real, b_k_blks, gemm_n);
+                if (active) {
+                    int64_t rows_remaining = gemm_m;
+                    float * c_blk          = output + ni;
+                    auto *  a_row_cur      = a_row;
 
-                    c_blk += rows_handled * gemm_n;
-                    a_row_cur += rows_handled * row_stride_a;
+                    while (rows_remaining > 0) {
+                        auto rows_handled = gemm_kernel(b_blk_len, a_row_cur, b_col, b_col_zp, c_blk, rows_remaining,
+                                                        nb_real, b_k_blks, gemm_n);
+
+                        c_blk += rows_handled * gemm_n;
+                        a_row_cur += rows_handled * row_stride_a;
 
-                    rows_remaining -= rows_handled;
+                        rows_remaining -= rows_handled;
+                    }
                 }
 
-                if (ith % 2 == 0) {
+                if (has_pair && ith % 2 == 0) {
                     spine_barrier_wait(cur_barrier);
                 }
 
                 const int64_t next_ni = ni + NB_COLS * nth;
                 if (next_ni < gemm_n) {
-                    nb_real = std::min(gemm_n - next_ni, NB_COLS);
+                    const int64_t next_nb = std::min(gemm_n - next_ni, (int64_t) NB_COLS);
                     spacemit_kernels::rvv::memcpy1d(b_col, reinterpret_cast<uint8_t *>(w_data) + next_ni * row_stride_b,
-                                                    nb_real * row_stride_b);
+                                                    next_nb * row_stride_b);
                 }
             }
         } else {
@@ -725,6 +740,8 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_
 
         if (valid_ep_count_t % nth == 0 && tcm_buffer != nullptr && valid_ep_count_t == n_as &&
             valid_act_count_t == n_as && per_nb_cols_wsize <= tcm_buffer_size) {
+            // Pair-barrier lockstep: an even thread with no odd partner (odd nth) must not wait.
+            const bool has_pair = ((ith & 1) != 0) || (ith + 1 < nth);
             for (int64_t valid_id = ith; valid_id < valid_ep_count_t; valid_id += nth) {
                 const int64_t cur_a = valid_matrix_row_counts[valid_id];
 
@@ -756,7 +773,9 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_
                     }
                 }
 
-                spine_barrier_wait(cur_barrier);
+                if (has_pair) {
+                    spine_barrier_wait(cur_barrier);
+                }
 
                 if (ith % 2 != 0) {
                     if (a_row != src1_col) {
@@ -768,13 +787,13 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS> class tensor_
 
                 int64_t nb_real = std::min(ne01, NB_COLS);
                 for (int64_t ni = 0; ni < ne01; ni += NB_COLS) {
-                    if (ith % 2 != 0) {
+                    if (has_pair && ith % 2 != 0) {
                         spine_barrier_wait(cur_barrier);
                     }
 
                     gemm_kernel(b_blk_len, a_row, b_col, b_col_zp, c_blk + ni, 1, nb_real, b_k_blks, ne01);
 
-                    if (ith % 2 == 0) {
+                    if (has_pair && ith % 2 == 0) {
                         spine_barrier_wait(cur_barrier);
                     }
 
diff --git a/tools/mtmd/smt-vision-preprocess.cpp b/tools/mtmd/smt-vision-preprocess.cpp
index 4e80d9e84238..c86871bb2fde 100644
--- a/tools/mtmd/smt-vision-preprocess.cpp
+++ b/tools/mtmd/smt-vision-preprocess.cpp
@@ -52,6 +52,13 @@ static ep_preproc_spec resolve_preproc_spec(const std::string & architecture) {
                  /* apply */ false };
     }
 
+    // MiniCPM-V SMT ONNX applies rescale/mean/std from config.json (no internal normalization).
+    if (contains_icase(architecture, "minicpmv") || contains_icase(architecture, "minicpm_v") ||
+        contains_icase(architecture, "minicpm-v")) {
+        return { /* target_w */ 448, /* target_h */ 448, /* normalize_to_01 */ false, /* quantize */ true,
+                 /* apply */ true };
+    }
+
     if (contains_icase(architecture, "paddleocr")) {
         return { /* target_w */ 0, /* target_h */ 0, /* normalize_to_01 */ false, /* quantize */ true,
                  /* apply */ true };
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index eec80696ad3f..b5eaab05a327 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -2990,9 +2990,22 @@ struct server_context_impl {
                             // the largest pos_min required for a checkpoint to be useful
                             const auto pos_min_thold = std::max(0, pos_next - n_swa - 1);
 
-                            // note: disallow with multimodal contexts for now
-                            //       https://github.com/ggml-org/llama.cpp/issues/17043
-                            if (!slot.prompt.tokens.has_mtmd && n_past > 0 && n_past <= slot.prompt.n_tokens()) {
+                            // Multimodal: force full re-prefill instead of checkpoint restore or
+                            // partial seq_rm (vision state is external; KV backend may be FULL-only).
+                            // https://github.com/ggml-org/llama.cpp/issues/17043
+                            if (slot.prompt.tokens.has_mtmd &&
+                                n_past > 0 && n_past < slot.prompt.n_tokens()) {
+                                SLT_WRN(slot,
+                                        "forcing full prompt re-processing for multimodal context "
+                                        "(cannot restore vision state from checkpoint / partial "
+                                        "sequence removal unsupported); dropping %d cached tokens "
+                                        "(n_past %d -> 0)\n",
+                                        (int) slot.prompt.n_tokens() - n_past, n_past);
+                                pos_next = 0;
+                                n_past   = 0;
+                            }
+
+                            if (!slot.prompt.tokens.has_mtmd && n_past > 0 && n_past < slot.prompt.n_tokens()) {
                                 const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id);
                                 if (pos_min == -1) {
                                     SLT_ERR(slot,