From 1b10b28cd6774f93d364de82f155c308746f65b0 Mon Sep 17 00:00:00 2001
From: oscar1229 <2252289184@qq.com>
Date: Fri, 3 Jul 2026 14:16:45 +0800
Subject: [PATCH] fix(spacemit): fix MiniCPM-V SMT multimodal inference on RISC-V

Fixes three independent issues that prevented MiniCPM-V from running via
the SMT media backend with multi-threaded warmup and multi-turn image
conversations.

* fix(spacemit): fix IME paired-lane GEMM threadpool deadlock

The IME GEMM kernels (forward_mul_mat and the mul_mat_id MoE path)
rendezvous thread pairs (2k, 2k+1) on a spine_barrier built for two
participants, so both lanes must call spine_barrier_wait() the same
number of times. The old per-thread loop could iterate a different
number of times per lane when gemm_n was not a multiple of NB_COLS*nth,
and the trailing even thread on odd nth had no partner, so warmup hung
with -t 8. Drive the loop from a pair-aligned base with a per-lane
offset (both lanes always iterate equally; an out-of-range lane skips
the GEMM but still hits the barrier) and guard the barrier with has_pair
so a partnerless thread never waits.

* server: force full re-prefill for multimodal FULL-only KV cache reuse

MiniCPM-V runs on the qwen35 hybrid (SSM + periodic full-attention)
backend whose KV memory only supports full sequence removal. On a
multi-turn request, partial prompt-cache reuse would either restore a
context checkpoint (resurrecting a KV state inconsistent with the
external smt/ONNX vision embeddings) or call partial memory_seq_rm on
FULL-only memory, which returns false and triggers GGML_ABORT. When the
context is multimodal and the reused prefix is partial, force a full
re-prefill (pos_next = 0, n_past = 0) before the checkpoint / seq_rm
path. Pure-append turns are unaffected. For non-multimodal contexts the
only change is that the checkpoint path is now entered for
n_past < n_tokens instead of n_past <= n_tokens.

* feat(mtmd): add MiniCPM-V SMT vision preprocessing

The MiniCPM-V SMT vision ONNX export does not normalize pixels internally.
Detect minicpmv / minicpm_v / minicpm-v architectures and route them through rgb_u8_to_chw_f32_with_config, which reads rescale_factor / image_mean / image_std from config.json's vision_preprocess block and emits a CHW float32 tensor. Target defaults to 448x448, overridable via vision_model.input_width/height. --- ggml/src/ggml-cpu/spacemit/ime.cpp | 71 ++++++++++++++++++---------- tools/mtmd/smt-vision-preprocess.cpp | 7 +++ tools/server/server-context.cpp | 19 ++++++-- 3 files changed, 68 insertions(+), 29 deletions(-) diff --git a/ggml/src/ggml-cpu/spacemit/ime.cpp b/ggml/src/ggml-cpu/spacemit/ime.cpp index 50e137b50af9..d474549ee913 100644 --- a/ggml/src/ggml-cpu/spacemit/ime.cpp +++ b/ggml/src/ggml-cpu/spacemit/ime.cpp @@ -439,55 +439,70 @@ template class tensor_ } uint8_t * b_col_zp = block_type_has_zp() ? b_col : nullptr; - int64_t ni = ith * NB_COLS; - int64_t nb_real = std::min(gemm_n - ni, NB_COLS); + // Pair-barrier lockstep: an even thread with no odd partner (odd nth) must not wait. 
+ const bool has_pair = ((ith & 1) != 0) || (ith + 1 < nth); - if (ith % 2 == 0 && nb_real > 0) { - spacemit_kernels::rvv::memcpy1d(b_col, reinterpret_cast(w_data) + ni * row_stride_b, - nb_real * row_stride_b); + const int64_t ni0 = (int64_t) ith * NB_COLS; + const int64_t nb0 = std::min(gemm_n - ni0, (int64_t) NB_COLS); + const bool active0 = nb0 > 0; + + if (ith % 2 == 0 && active0) { + spacemit_kernels::rvv::memcpy1d(b_col, reinterpret_cast(w_data) + ni0 * row_stride_b, + nb0 * row_stride_b); if (a_row != quant_a_buffer) { spacemit_kernels::rvv::memcpy1d(a_row, quant_a_buffer, gemm_workspace_size); } } - spine_barrier_wait(cur_barrier); + if (has_pair) { + spine_barrier_wait(cur_barrier); + } - if (ith % 2 != 0 && nb_real > 0) { + if (ith % 2 != 0 && active0) { if (a_row != quant_a_buffer) { spacemit_kernels::rvv::memcpy1d(a_row, quant_a_buffer, gemm_workspace_size); } - spacemit_kernels::rvv::memcpy1d(b_col, reinterpret_cast(w_data) + ni * row_stride_b, - nb_real * row_stride_b); + spacemit_kernels::rvv::memcpy1d(b_col, reinterpret_cast(w_data) + ni0 * row_stride_b, + nb0 * row_stride_b); } - for (; ni < gemm_n; ni += NB_COLS * nth) { - int64_t rows_remaining = gemm_m; - float * c_blk = output + ni; - auto * a_row_cur = a_row; + // Iterate over the even lane's column base so both lanes of a pair run in lockstep. + const int64_t base_start = (int64_t) (ith & ~1) * NB_COLS; + const int64_t lane_off = (int64_t) (ith & 1) * NB_COLS; + for (int64_t base = base_start; base < gemm_n; base += NB_COLS * nth) { + const int64_t ni = base + lane_off; + const bool active = ni < gemm_n; + const int64_t nb_real = active ? 
std::min(gemm_n - ni, (int64_t) NB_COLS) : 0; - if (ith % 2 != 0) { + if (has_pair && ith % 2 != 0) { spine_barrier_wait(cur_barrier); } - while (rows_remaining > 0) { - auto rows_handled = gemm_kernel(b_blk_len, a_row_cur, b_col, b_col_zp, c_blk, rows_remaining, - nb_real, b_k_blks, gemm_n); + if (active) { + int64_t rows_remaining = gemm_m; + float * c_blk = output + ni; + auto * a_row_cur = a_row; - c_blk += rows_handled * gemm_n; - a_row_cur += rows_handled * row_stride_a; + while (rows_remaining > 0) { + auto rows_handled = gemm_kernel(b_blk_len, a_row_cur, b_col, b_col_zp, c_blk, rows_remaining, + nb_real, b_k_blks, gemm_n); + + c_blk += rows_handled * gemm_n; + a_row_cur += rows_handled * row_stride_a; - rows_remaining -= rows_handled; + rows_remaining -= rows_handled; + } } - if (ith % 2 == 0) { + if (has_pair && ith % 2 == 0) { spine_barrier_wait(cur_barrier); } const int64_t next_ni = ni + NB_COLS * nth; if (next_ni < gemm_n) { - nb_real = std::min(gemm_n - next_ni, NB_COLS); + const int64_t next_nb = std::min(gemm_n - next_ni, (int64_t) NB_COLS); spacemit_kernels::rvv::memcpy1d(b_col, reinterpret_cast(w_data) + next_ni * row_stride_b, - nb_real * row_stride_b); + next_nb * row_stride_b); } } } else { @@ -725,6 +740,8 @@ template class tensor_ if (valid_ep_count_t % nth == 0 && tcm_buffer != nullptr && valid_ep_count_t == n_as && valid_act_count_t == n_as && per_nb_cols_wsize <= tcm_buffer_size) { + // Pair-barrier lockstep: an even thread with no odd partner (odd nth) must not wait. 
+ const bool has_pair = ((ith & 1) != 0) || (ith + 1 < nth); for (int64_t valid_id = ith; valid_id < valid_ep_count_t; valid_id += nth) { const int64_t cur_a = valid_matrix_row_counts[valid_id]; @@ -756,7 +773,9 @@ template class tensor_ } } - spine_barrier_wait(cur_barrier); + if (has_pair) { + spine_barrier_wait(cur_barrier); + } if (ith % 2 != 0) { if (a_row != src1_col) { @@ -768,13 +787,13 @@ template class tensor_ int64_t nb_real = std::min(ne01, NB_COLS); for (int64_t ni = 0; ni < ne01; ni += NB_COLS) { - if (ith % 2 != 0) { + if (has_pair && ith % 2 != 0) { spine_barrier_wait(cur_barrier); } gemm_kernel(b_blk_len, a_row, b_col, b_col_zp, c_blk + ni, 1, nb_real, b_k_blks, ne01); - if (ith % 2 == 0) { + if (has_pair && ith % 2 == 0) { spine_barrier_wait(cur_barrier); } diff --git a/tools/mtmd/smt-vision-preprocess.cpp b/tools/mtmd/smt-vision-preprocess.cpp index 4e80d9e84238..c86871bb2fde 100644 --- a/tools/mtmd/smt-vision-preprocess.cpp +++ b/tools/mtmd/smt-vision-preprocess.cpp @@ -52,6 +52,13 @@ static ep_preproc_spec resolve_preproc_spec(const std::string & architecture) { /* apply */ false }; } + // MiniCPM-V SMT ONNX applies rescale/mean/std from config.json (no internal normalization). 
+ if (contains_icase(architecture, "minicpmv") || contains_icase(architecture, "minicpm_v") || + contains_icase(architecture, "minicpm-v")) { + return { /* target_w */ 448, /* target_h */ 448, /* normalize_to_01 */ false, /* quantize */ true, + /* apply */ true }; + } + if (contains_icase(architecture, "paddleocr")) { return { /* target_w */ 0, /* target_h */ 0, /* normalize_to_01 */ false, /* quantize */ true, /* apply */ true }; diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index eec80696ad3f..b5eaab05a327 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -2990,9 +2990,22 @@ struct server_context_impl { // the largest pos_min required for a checkpoint to be useful const auto pos_min_thold = std::max(0, pos_next - n_swa - 1); - // note: disallow with multimodal contexts for now - // https://github.com/ggml-org/llama.cpp/issues/17043 - if (!slot.prompt.tokens.has_mtmd && n_past > 0 && n_past <= slot.prompt.n_tokens()) { + // Multimodal: force full re-prefill instead of checkpoint restore or + // partial seq_rm (vision state is external; KV backend may be FULL-only). + // https://github.com/ggml-org/llama.cpp/issues/17043 + if (slot.prompt.tokens.has_mtmd && + n_past > 0 && n_past < slot.prompt.n_tokens()) { + SLT_WRN(slot, + "forcing full prompt re-processing for multimodal context " + "(cannot restore vision state from checkpoint / partial " + "sequence removal unsupported); dropping %d cached tokens " + "(n_past %d -> 0)\n", + (int) slot.prompt.n_tokens() - n_past, n_past); + pos_next = 0; + n_past = 0; + } + + if (!slot.prompt.tokens.has_mtmd && n_past > 0 && n_past < slot.prompt.n_tokens()) { const auto pos_min = llama_memory_seq_pos_min(llama_get_memory(ctx_tgt), slot.id); if (pos_min == -1) { SLT_ERR(slot,