From 38118a679474165a8e1a0172decb27aa208e19c2 Mon Sep 17 00:00:00 2001 From: Lorenzo Fiore Date: Tue, 9 Jun 2026 23:04:23 +0200 Subject: [PATCH 1/3] feat(dl): route CPU-backend access through ggml registry for Windows DL Under GGML_BACKEND_DL the CPU backend is a dlopen'd module, so the parakeet/ggml core can no longer reference ggml_backend_cpu_init / _is_cpu / _set_n_threads / cpu_buffer_from_ptr directly. macOS/Linux papered over this with -undefined dynamic_lookup / --allow-shlib-undefined + an RTLD_GLOBAL ggml patch, but MSVC has no equivalent and rejects the unresolved symbols at DLL link time (LNK2019), so Windows DL was deferred. Extend the vendored parakeet patch to obtain CPU access via the ggml device registry (in ggml-base, always linked) instead: - ggml_backend_cpu_init() -> ggml_backend_dev_init( ggml_backend_dev_by_type(CPU), NULL) - ggml_backend_is_cpu(b) -> ggml_backend_dev_type( ggml_backend_get_device(b)) == CPU - ggml_backend_cpu_set_n_threads -> ggml_backend_reg_get_proc_address( reg, "ggml_backend_set_n_threads") (the llama.cpp DL pattern) - ggml_backend_cpu_buffer_from_ptr -> ggml_backend_dev_buffer_from_host_ptr (maps to the same zero-copy CPU buffer) Single path for both static and DL: the registry is populated in both modes (static via GGML_USE_CPU; DL via the existing ggml_backend_load_all in global_backend(), which runs before any Backend is constructed), so no #ifdef GGML_BACKEND_DL is needed. Drops the now-unused ggml-cpu.h include. Add the windows-latest DL leg to CI and remove the "deferred" note. Validated on macOS: static build, DL build, and the DL Metal test (dl_backend_is_metal -> MTL0, real transcribe) all pass. 
--- .github/workflows/ci.yml | 13 +- .../patches/parakeet/0001-backend-dl.patch | 138 ++++++++++++++++++ 2 files changed, 146 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 772ef05..680479b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,11 +39,14 @@ jobs: - os: ubuntu-latest kind: dl features: "--features dynamic-backends,vulkan" - # NOTE: Windows DL is intentionally NOT built yet. MSVC cannot defer the - # direct ggml_backend_cpu_init/_set_n_threads symbol references at DLL link - # time (no `-undefined dynamic_lookup` equivalent → LNK2019). A follow-up - # parakeet patch must route CPU-backend access through the ggml registry - # before Windows DL can link. Windows is not functional yet anyway. + # Windows: DL + Vulkan. The parakeet patch routes all CPU-backend access + # through the ggml device registry (no direct ggml_backend_cpu_* symbol + # references), so the shared parakeet/ggml core links cleanly under MSVC — + # no `-undefined dynamic_lookup` equivalent needed. The CPU + Vulkan + # backends ship as loadable modules, dlopen'd at runtime. + - os: windows-latest + kind: dl + features: "--features dynamic-backends,vulkan" env: # Pin a known-good LunarG SDK version for Windows (see llama.cpp CI). 
VULKAN_VERSION: "1.4.313.2" diff --git a/parakeet-cpp-sys/patches/parakeet/0001-backend-dl.patch b/parakeet-cpp-sys/patches/parakeet/0001-backend-dl.patch index cb8a949..fba52f1 100644 --- a/parakeet-cpp-sys/patches/parakeet/0001-backend-dl.patch +++ b/parakeet-cpp-sys/patches/parakeet/0001-backend-dl.patch @@ -58,6 +58,97 @@ index b082455..de21cd2 100644 #ifdef __cplusplus } // extern "C" #endif +diff --git a/src/backend.cpp b/src/backend.cpp +index 40055f9..03e2a56 100644 +--- a/src/backend.cpp ++++ b/src/backend.cpp +@@ -6,7 +6,6 @@ + #include "ggml.h" + #include "ggml-alloc.h" + #include "ggml-backend.h" +-#include "ggml-cpu.h" + + #include + #include +@@ -45,6 +44,45 @@ struct PendingCapture { + ggml_tensor* tensor; + std::vector* dst; + }; ++ ++// CPU-backend access routed through the ggml device registry instead of the ++// CPU-module symbols (ggml_backend_cpu_init / _is_cpu / _set_n_threads). Under ++// GGML_BACKEND_DL the CPU backend is a dlopen'd module, so its symbols are NOT ++// resolvable at link time (MSVC rejects this with LNK2019); the registry API ++// (in ggml-base, always linked) is. The registry is populated for BOTH the ++// static build (GGML_USE_CPU registers the CPU device at startup) and the DL ++// build (global_backend() runs ggml_backend_load_all before any Backend is ++// constructed), so this single path works for both — no #ifdef needed. ++ ++// Create a CPU backend via the registry: look up the CPU device, then init it. ++// Returns nullptr if no CPU device is registered. ++ggml_backend_t cpu_backend_init() { ++ ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); ++ return dev ? ggml_backend_dev_init(dev, nullptr) : nullptr; ++} ++ ++// True if `backend` is a CPU-type device (registry replacement for ++// ggml_backend_is_cpu). 
++bool backend_is_cpu(ggml_backend_t backend) { ++ if (!backend) return false; ++ ggml_backend_dev_t dev = ggml_backend_get_device(backend); ++ return dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU; ++} ++ ++// Set the thread count on a CPU backend without linking the CPU-module symbol: ++// fetch ggml_backend_set_n_threads from the backend's registry entry via ++// proc-address (the standard ggml way for dynamically-loaded backends — this is ++// exactly what llama.cpp does). No-op if the backend/device/reg/proc is absent. ++void backend_set_n_threads(ggml_backend_t backend, int n_threads) { ++ if (!backend) return; ++ ggml_backend_dev_t dev = ggml_backend_get_device(backend); ++ if (!dev) return; ++ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); ++ if (!reg) return; ++ auto fn = (ggml_backend_set_n_threads_t)ggml_backend_reg_get_proc_address( ++ reg, "ggml_backend_set_n_threads"); ++ if (fn) fn(backend, n_threads); ++} + } // namespace + + struct Backend::Impl { +@@ -124,7 +162,7 @@ Backend::Backend(int n_threads) : impl_(new Impl()) { + want.c_str()); + } + if (!impl_->backend) { // CPU fallback (or CPU-only build) +- impl_->backend = ggml_backend_cpu_init(); ++ impl_->backend = cpu_backend_init(); + device_name_ = "cpu"; + } + if (!impl_->backend) { +@@ -136,7 +174,7 @@ Backend::Backend(int n_threads) : impl_(new Impl()) { + // instead of aborting. The CPU/single-backend path keeps using the persistent + // gallocr below and is untouched. + if (impl_->use_sched) { +- impl_->cpu_backend = ggml_backend_cpu_init(); ++ impl_->cpu_backend = cpu_backend_init(); + if (!impl_->cpu_backend) { + PK_LOG("pk::Backend: CPU fallback init failed; disabling sched"); + impl_->use_sched = false; +@@ -159,11 +197,11 @@ Backend::~Backend() { + + void Backend::set_n_threads(int n_threads) { + n_threads_ = n_threads > 0 ? 
n_threads : 1; +- if (impl_ && impl_->backend && ggml_backend_is_cpu(impl_->backend)) { +- ggml_backend_cpu_set_n_threads(impl_->backend, n_threads_); ++ if (impl_ && impl_->backend && backend_is_cpu(impl_->backend)) { ++ backend_set_n_threads(impl_->backend, n_threads_); + } + if (impl_ && impl_->cpu_backend) { +- ggml_backend_cpu_set_n_threads(impl_->cpu_backend, n_threads_); ++ backend_set_n_threads(impl_->cpu_backend, n_threads_); + } + } + diff --git a/src/ggml_graph.cpp b/src/ggml_graph.cpp index f5bf84a..6b9b45c 100644 --- a/src/ggml_graph.cpp @@ -94,6 +185,53 @@ index f5bf84a..6b9b45c 100644 // Lazy create (reset-safe: shutdown_backend() can free it, and a later call // recreates it). Always reached under g_backend_mutex (run_graph holds it) // or before any inference thread exists, so a plain null-check is sufficient. +diff --git a/src/model_loader.cpp b/src/model_loader.cpp +index a1a0d66..b138216 100644 +--- a/src/model_loader.cpp ++++ b/src/model_loader.cpp +@@ -3,8 +3,8 @@ + #include "ggml.h" + #include "ggml-backend.h" + #include "ggml-alloc.h" +-#include "ggml-cpu.h" + #include "gguf.h" ++#include + #include + #include + #include +@@ -73,18 +73,25 @@ bool ModelLoader::realize_weights(ggml_backend_t backend){ + if(weights_buf_) return true; // idempotent + if(!backend || !ctx_){ PK_LOG("realize_weights: null backend/ctx"); return false; } + +- if (ggml_backend_is_cpu(backend)) { ++ // CPU access is routed through the ggml device registry rather than the ++ // CPU-module symbols (ggml_backend_is_cpu / ggml_backend_cpu_buffer_from_ptr), ++ // so this links under GGML_BACKEND_DL (CPU is a dlopen'd module whose symbols ++ // are not available at link time) as well as in the static build. The CPU ++ // device's buffer_from_host_ptr maps to ggml_backend_cpu_buffer_from_ptr, so ++ // the zero-copy semantics below are unchanged. 
++ ggml_backend_dev_t dev = ggml_backend_get_device(backend); ++ if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { + // Fast path: borrow the host ctx memory directly (no copy). + // The GGUF is loaded with no_alloc=false, so every tensor's data lives + // in one contiguous ctx mem_buffer. Wrap that exact memory as a CPU +- // backend buffer (zero-copy: ggml_backend_cpu_buffer_from_ptr borrows +- // the ptr) and point every tensor's ->buffer at it, so graphs can +- // reference the loader tensors DIRECTLY as leaves (the gallocr treats +- // data!=NULL tensors as already-allocated and never copies them; +- // reshapes/views resolve at build time). Eliminates per-call recopy. ++ // backend buffer (zero-copy: buffer_from_host_ptr borrows the ptr) and ++ // point every tensor's ->buffer at it, so graphs can reference the loader ++ // tensors DIRECTLY as leaves (the gallocr treats data!=NULL tensors as ++ // already-allocated and never copies them; reshapes/views resolve at ++ // build time). Eliminates per-call recopy. + void* base = ggml_get_mem_buffer(ctx_); + size_t size = ggml_get_mem_size(ctx_); +- weights_buf_ = ggml_backend_cpu_buffer_from_ptr(base, size); ++ weights_buf_ = ggml_backend_dev_buffer_from_host_ptr(dev, base, size, SIZE_MAX); + if(!weights_buf_){ PK_LOG("realize_weights: buffer_from_ptr failed"); return false; } + for(auto& kv : tensors_) kv.second->buffer = weights_buf_; + return true; diff --git a/src/parakeet_capi.cpp b/src/parakeet_capi.cpp index 01de213..1ddc2f0 100644 --- a/src/parakeet_capi.cpp From bd69c9cf4d6d64646619902333f85ac89b0311ee Mon Sep 17 00:00:00 2001 From: Lorenzo Fiore Date: Tue, 9 Jun 2026 23:17:26 +0200 Subject: [PATCH 2/3] fix(dl): find parakeet import lib in build tree for Windows DL link parakeet.cpp has no install() rule, so the parakeet library only exists in the cmake build tree, never the install prefix. 
Under DL on Windows the SHARED parakeet target produces parakeet.dll + an MSVC import parakeet.lib; ggml redirects DLLs to <build>/bin (CMAKE_RUNTIME_OUTPUT_DIRECTORY, a dir-scoped var that doesn't reach the parent parakeet scope) and the import .lib lands in a generator-dependent spot the fixed lib_dirs list didn't cover -> the consumer test exe link failed with LNK1181: cannot open input file 'parakeet.lib'. (The library-only `cargo build` passed because building an rlib performs no final link; only `cargo test`, which builds executables, surfaced it.) Walk the whole build tree and add every dir holding a linkable artifact (.lib/.a/.dll/.so/.dylib) to the link-search path, so parakeet.lib is found regardless of generator/platform layout. The install `lib/` dir stays first in search order, so the linked ggml/ggml-base resolve to the install copies; the extra dirs only add the otherwise-uninstalled parakeet library. macOS static + DL builds and the DL Metal test still pass. --- parakeet-cpp-sys/build.rs | 50 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/parakeet-cpp-sys/build.rs b/parakeet-cpp-sys/build.rs index 491a475..1dca21a 100644 --- a/parakeet-cpp-sys/build.rs +++ b/parakeet-cpp-sys/build.rs @@ -97,7 +97,7 @@ fn main() { // backend MODULES land in `bin/`. The static build keeps everything as // `lib*.a` / `*.lib` across `lib/` + the build tree (+ `Release/` on the // Windows multi-config generators). - let lib_dirs = [ + let mut lib_dirs = vec![ dst.join("lib"), dst.join("bin"), dst.join("build"), @@ -105,6 +105,20 @@ fn main() { dst.join("bin").join("Release"), dst.join("build").join("Release"), ]; + // parakeet.cpp has NO install() rule, so the parakeet library (static `.lib`/ + // `.a`, or — under DL — the SHARED `.dll`/.so/.dylib + its MSVC import `.lib`) + // is never copied to the install prefix; it only exists somewhere in the build + // tree. 
ggml redirects DLLs to `<build>/bin` via CMAKE_RUNTIME_OUTPUT_DIRECTORY + // (a directory-scoped var that does NOT propagate to the parent parakeet + // scope), and the Ninja vs multi-config generators differ on where the import + // `.lib` lands. Rather than enumerate every layout, walk the whole build tree + // and add every dir that holds a linkable artifact, so `parakeet.lib` (the DL + // import lib that the consumer exe links) is found regardless of generator. + for dir in find_link_dirs(&dst.join("build")) { + if !lib_dirs.contains(&dir) { + lib_dirs.push(dir); + } + } for dir in &lib_dirs { println!("cargo:rustc-link-search=native={}", dir.display()); } @@ -294,6 +308,40 @@ fn apply_patches(dir: &std::path::Path, root: &std::path::Path) { } } +/// Recursively collect every directory under `root` that contains at least one +/// linkable artifact (`.lib` / `.a` / `.dll` / `.so` / `.dylib`). Used to find +/// the parakeet library in the build tree (it has no install rule, and its exact +/// location varies by generator/platform). Returns an empty vec if `root` is +/// absent. No particular order (e.g. deepest-first) is guaranteed; order is +/// irrelevant for link-search. +fn find_link_dirs(root: &std::path::Path) -> Vec { + fn is_linkable(p: &std::path::Path) -> bool { + p.extension() + .and_then(|x| x.to_str()) + .is_some_and(|x| matches!(x, "lib" | "a" | "dll" | "so" | "dylib")) + } + let mut out = Vec::new(); + let mut stack = vec![root.to_path_buf()]; + while let Some(dir) = stack.pop() { + let Ok(entries) = std::fs::read_dir(&dir) else { + continue; + }; + let mut has_lib = false; + for e in entries.filter_map(Result::ok) { + let p = e.path(); + if p.is_dir() { + stack.push(p); + } else if is_linkable(&p) { + has_lib = true; + } + } + if has_lib { + out.push(dir); + } + } + out +} + /// True if `dir` holds at least one dynamic library (`.dylib` / `.so` / `.dll`). /// Used under DL to decide which install dirs deserve a runtime rpath. 
fn dir_has_dynamic_lib(dir: &std::path::Path) -> bool { From 64004a42e7f5fef258f3ce9e7571f62d5c25526c Mon Sep 17 00:00:00 2001 From: Lorenzo Fiore Date: Tue, 9 Jun 2026 23:27:20 +0200 Subject: [PATCH 3/3] fix(dl): export parakeet symbols on Windows so the import lib is created MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The parakeet C API (parakeet_capi_*) is plain `extern "C"` with no __declspec(dllexport) or GGML_API-style export macro. On Windows a SHARED DLL with no exported symbols produces NO import library, so the consumer link failed with `LNK1181: cannot open input file 'parakeet.lib'` — the file was never created (the previous build-tree search couldn't find what doesn't exist). Set WINDOWS_EXPORT_ALL_SYMBOLS on the SHARED parakeet target so CMake auto-generates a .def exporting every public symbol and MSVC emits the import lib. No-op on ELF/Mach-O (default-export), so macOS static + DL builds and the DL Metal test are unaffected (verified locally). Pairs with the build-tree link-search walk: this creates parakeet.lib, that finds it. 
--- .../patches/parakeet/0001-backend-dl.patch | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/parakeet-cpp-sys/patches/parakeet/0001-backend-dl.patch b/parakeet-cpp-sys/patches/parakeet/0001-backend-dl.patch index fba52f1..0d7406b 100644 --- a/parakeet-cpp-sys/patches/parakeet/0001-backend-dl.patch +++ b/parakeet-cpp-sys/patches/parakeet/0001-backend-dl.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index da2dfb8..4d84c3d 100644 +index da2dfb8..a98c6d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,16 +14,32 @@ option(PARAKEET_GGML_CUDA "Forward GGML_CUDA" OFF) @@ -37,6 +37,21 @@ index da2dfb8..4d84c3d 100644 set(GGML_NATIVE ON CACHE BOOL "ggml: optimize the build for the current system" FORCE) endif() if(NOT DEFINED GGML_LLAMAFILE) +@@ -84,6 +100,14 @@ set(PARAKEET_SRC + + if(PARAKEET_SHARED) + add_library(parakeet SHARED ${PARAKEET_SRC}) ++ # The C API (parakeet_capi_*) is plain `extern "C"` with no ++ # __declspec(dllexport) / GGML_API-style macro, so on Windows the SHARED ++ # parakeet.dll would export nothing and MSVC would emit NO import library ++ # (parakeet.lib) — the consumer link then fails with ++ # `LNK1181: cannot open input file 'parakeet.lib'`. Auto-export all symbols so ++ # the import lib is produced (CMake generates a .def of every public symbol). ++ # No-op on ELF/Mach-O, which export by default. ++ set_target_properties(parakeet PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) + else() + add_library(parakeet STATIC ${PARAKEET_SRC}) + endif() diff --git a/include/parakeet_capi.h b/include/parakeet_capi.h index b082455..de21cd2 100644 --- a/include/parakeet_capi.h