diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 772ef05..680479b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,11 +39,14 @@ jobs: - os: ubuntu-latest kind: dl features: "--features dynamic-backends,vulkan" - # NOTE: Windows DL is intentionally NOT built yet. MSVC cannot defer the - # direct ggml_backend_cpu_init/_set_n_threads symbol references at DLL link - # time (no `-undefined dynamic_lookup` equivalent → LNK2019). A follow-up - # parakeet patch must route CPU-backend access through the ggml registry - # before Windows DL can link. Windows is not functional yet anyway. + # Windows: DL + Vulkan. The parakeet patch routes all CPU-backend access + # through the ggml device registry (no direct ggml_backend_cpu_* symbol + # references), so the shared parakeet/ggml core links cleanly under MSVC — + # no `-undefined dynamic_lookup` equivalent needed. The CPU + Vulkan + # backends ship as loadable modules, dlopen'd at runtime. + - os: windows-latest + kind: dl + features: "--features dynamic-backends,vulkan" env: # Pin a known-good LunarG SDK version for Windows (see llama.cpp CI). VULKAN_VERSION: "1.4.313.2" diff --git a/parakeet-cpp-sys/build.rs b/parakeet-cpp-sys/build.rs index 491a475..1dca21a 100644 --- a/parakeet-cpp-sys/build.rs +++ b/parakeet-cpp-sys/build.rs @@ -97,7 +97,7 @@ fn main() { // backend MODULES land in `bin/`. The static build keeps everything as // `lib*.a` / `*.lib` across `lib/` + the build tree (+ `Release/` on the // Windows multi-config generators). 
- let lib_dirs = [ + let mut lib_dirs = vec![ dst.join("lib"), dst.join("bin"), dst.join("build"), @@ -105,6 +105,20 @@ fn main() { dst.join("bin").join("Release"), dst.join("build").join("Release"), ]; + // parakeet.cpp has NO install() rule, so the parakeet library (static `.lib`/ + // `.a`, or — under DL — the SHARED `.dll`/`.so`/`.dylib` + its MSVC import `.lib`) + // is never copied to the install prefix; it only exists somewhere in the build + // tree. ggml redirects DLLs to `<build>/bin` via CMAKE_RUNTIME_OUTPUT_DIRECTORY + // (a directory-scoped var that does NOT propagate to the parent parakeet + // scope), and the Ninja vs multi-config generators differ on where the import + // `.lib` lands. Rather than enumerate every layout, walk the whole build tree + // and add every dir that holds a linkable artifact, so `parakeet.lib` (the DL + // import lib that the consumer exe links) is found regardless of generator. + for dir in find_link_dirs(&dst.join("build")) { + if !lib_dirs.contains(&dir) { + lib_dirs.push(dir); + } + } for dir in &lib_dirs { println!("cargo:rustc-link-search=native={}", dir.display()); } @@ -294,6 +308,40 @@ fn apply_patches(dir: &std::path::Path, root: &std::path::Path) { } } +/// Recursively collect every directory under `root` that contains at least one +/// linkable artifact (`.lib` / `.a` / `.dll` / `.so` / `.dylib`). Used to find +/// the parakeet library in the build tree (it has no install rule, and its exact +/// location varies by generator/platform). Returns an empty vec if `root` is +/// absent. The return order is unspecified (deepest-first is NOT guaranteed); order +/// is irrelevant for link-search. 
+fn find_link_dirs(root: &std::path::Path) -> Vec<std::path::PathBuf> { + fn is_linkable(p: &std::path::Path) -> bool { + p.extension() + .and_then(|x| x.to_str()) + .is_some_and(|x| matches!(x, "lib" | "a" | "dll" | "so" | "dylib")) + } + let mut out = Vec::new(); + let mut stack = vec![root.to_path_buf()]; + while let Some(dir) = stack.pop() { + let Ok(entries) = std::fs::read_dir(&dir) else { + continue; + }; + let mut has_lib = false; + for e in entries.filter_map(Result::ok) { + let p = e.path(); + if p.is_dir() { + stack.push(p); + } else if is_linkable(&p) { + has_lib = true; + } + } + if has_lib { + out.push(dir); + } + } + out +} + /// True if `dir` holds at least one dynamic library (`.dylib` / `.so` / `.dll`). /// Used under DL to decide which install dirs deserve a runtime rpath. fn dir_has_dynamic_lib(dir: &std::path::Path) -> bool { diff --git a/parakeet-cpp-sys/patches/parakeet/0001-backend-dl.patch b/parakeet-cpp-sys/patches/parakeet/0001-backend-dl.patch index cb8a949..0d7406b 100644 --- a/parakeet-cpp-sys/patches/parakeet/0001-backend-dl.patch +++ b/parakeet-cpp-sys/patches/parakeet/0001-backend-dl.patch @@ -1,5 +1,5 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index da2dfb8..4d84c3d 100644 +index da2dfb8..a98c6d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,16 +14,32 @@ option(PARAKEET_GGML_CUDA "Forward GGML_CUDA" OFF) @@ -37,6 +37,21 @@ index da2dfb8..4d84c3d 100644 set(GGML_NATIVE ON CACHE BOOL "ggml: optimize the build for the current system" FORCE) endif() if(NOT DEFINED GGML_LLAMAFILE) +@@ -84,6 +100,14 @@ set(PARAKEET_SRC + + if(PARAKEET_SHARED) + add_library(parakeet SHARED ${PARAKEET_SRC}) ++ # The C API (parakeet_capi_*) is plain `extern "C"` with no ++ # __declspec(dllexport) / GGML_API-style macro, so on Windows the SHARED ++ # parakeet.dll would export nothing and MSVC would emit NO import library ++ # (parakeet.lib) — the consumer link then fails with ++ # `LNK1181: cannot open input file 'parakeet.lib'`. 
Auto-export all symbols so ++ # the import lib is produced (CMake generates a .def of every public symbol). ++ # No-op on ELF/Mach-O, which export by default. ++ set_target_properties(parakeet PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) + else() + add_library(parakeet STATIC ${PARAKEET_SRC}) + endif() diff --git a/include/parakeet_capi.h b/include/parakeet_capi.h index b082455..de21cd2 100644 --- a/include/parakeet_capi.h @@ -58,6 +73,97 @@ index b082455..de21cd2 100644 #ifdef __cplusplus } // extern "C" #endif +diff --git a/src/backend.cpp b/src/backend.cpp +index 40055f9..03e2a56 100644 +--- a/src/backend.cpp ++++ b/src/backend.cpp +@@ -6,7 +6,6 @@ + #include "ggml.h" + #include "ggml-alloc.h" + #include "ggml-backend.h" +-#include "ggml-cpu.h" + + #include + #include +@@ -45,6 +44,45 @@ struct PendingCapture { + ggml_tensor* tensor; + std::vector* dst; + }; ++ ++// CPU-backend access routed through the ggml device registry instead of the ++// CPU-module symbols (ggml_backend_cpu_init / _is_cpu / _set_n_threads). Under ++// GGML_BACKEND_DL the CPU backend is a dlopen'd module, so its symbols are NOT ++// resolvable at link time (MSVC rejects this with LNK2019); the registry API ++// (in ggml-base, always linked) is. The registry is populated for BOTH the ++// static build (GGML_USE_CPU registers the CPU device at startup) and the DL ++// build (global_backend() runs ggml_backend_load_all before any Backend is ++// constructed), so this single path works for both — no #ifdef needed. ++ ++// Create a CPU backend via the registry: look up the CPU device, then init it. ++// Returns nullptr if no CPU device is registered. ++ggml_backend_t cpu_backend_init() { ++ ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU); ++ return dev ? ggml_backend_dev_init(dev, nullptr) : nullptr; ++} ++ ++// True if `backend` is a CPU-type device (registry replacement for ++// ggml_backend_is_cpu). 
++bool backend_is_cpu(ggml_backend_t backend) { ++ if (!backend) return false; ++ ggml_backend_dev_t dev = ggml_backend_get_device(backend); ++ return dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU; ++} ++ ++// Set the thread count on a CPU backend without linking the CPU-module symbol: ++// fetch ggml_backend_set_n_threads from the backend's registry entry via ++// proc-address (the standard ggml way for dynamically-loaded backends — this is ++// exactly what llama.cpp does). No-op if the backend/device/reg/proc is absent. ++void backend_set_n_threads(ggml_backend_t backend, int n_threads) { ++ if (!backend) return; ++ ggml_backend_dev_t dev = ggml_backend_get_device(backend); ++ if (!dev) return; ++ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); ++ if (!reg) return; ++ auto fn = (ggml_backend_set_n_threads_t)ggml_backend_reg_get_proc_address( ++ reg, "ggml_backend_set_n_threads"); ++ if (fn) fn(backend, n_threads); ++} + } // namespace + + struct Backend::Impl { +@@ -124,7 +162,7 @@ Backend::Backend(int n_threads) : impl_(new Impl()) { + want.c_str()); + } + if (!impl_->backend) { // CPU fallback (or CPU-only build) +- impl_->backend = ggml_backend_cpu_init(); ++ impl_->backend = cpu_backend_init(); + device_name_ = "cpu"; + } + if (!impl_->backend) { +@@ -136,7 +174,7 @@ Backend::Backend(int n_threads) : impl_(new Impl()) { + // instead of aborting. The CPU/single-backend path keeps using the persistent + // gallocr below and is untouched. + if (impl_->use_sched) { +- impl_->cpu_backend = ggml_backend_cpu_init(); ++ impl_->cpu_backend = cpu_backend_init(); + if (!impl_->cpu_backend) { + PK_LOG("pk::Backend: CPU fallback init failed; disabling sched"); + impl_->use_sched = false; +@@ -159,11 +197,11 @@ Backend::~Backend() { + + void Backend::set_n_threads(int n_threads) { + n_threads_ = n_threads > 0 ? 
n_threads : 1; +- if (impl_ && impl_->backend && ggml_backend_is_cpu(impl_->backend)) { +- ggml_backend_cpu_set_n_threads(impl_->backend, n_threads_); ++ if (impl_ && impl_->backend && backend_is_cpu(impl_->backend)) { ++ backend_set_n_threads(impl_->backend, n_threads_); + } + if (impl_ && impl_->cpu_backend) { +- ggml_backend_cpu_set_n_threads(impl_->cpu_backend, n_threads_); ++ backend_set_n_threads(impl_->cpu_backend, n_threads_); + } + } + diff --git a/src/ggml_graph.cpp b/src/ggml_graph.cpp index f5bf84a..6b9b45c 100644 --- a/src/ggml_graph.cpp @@ -94,6 +200,53 @@ index f5bf84a..6b9b45c 100644 // Lazy create (reset-safe: shutdown_backend() can free it, and a later call // recreates it). Always reached under g_backend_mutex (run_graph holds it) // or before any inference thread exists, so a plain null-check is sufficient. +diff --git a/src/model_loader.cpp b/src/model_loader.cpp +index a1a0d66..b138216 100644 +--- a/src/model_loader.cpp ++++ b/src/model_loader.cpp +@@ -3,8 +3,8 @@ + #include "ggml.h" + #include "ggml-backend.h" + #include "ggml-alloc.h" +-#include "ggml-cpu.h" + #include "gguf.h" ++#include + #include + #include + #include +@@ -73,18 +73,25 @@ bool ModelLoader::realize_weights(ggml_backend_t backend){ + if(weights_buf_) return true; // idempotent + if(!backend || !ctx_){ PK_LOG("realize_weights: null backend/ctx"); return false; } + +- if (ggml_backend_is_cpu(backend)) { ++ // CPU access is routed through the ggml device registry rather than the ++ // CPU-module symbols (ggml_backend_is_cpu / ggml_backend_cpu_buffer_from_ptr), ++ // so this links under GGML_BACKEND_DL (CPU is a dlopen'd module whose symbols ++ // are not available at link time) as well as in the static build. The CPU ++ // device's buffer_from_host_ptr maps to ggml_backend_cpu_buffer_from_ptr, so ++ // the zero-copy semantics below are unchanged. 
++ ggml_backend_dev_t dev = ggml_backend_get_device(backend); ++ if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { + // Fast path: borrow the host ctx memory directly (no copy). + // The GGUF is loaded with no_alloc=false, so every tensor's data lives + // in one contiguous ctx mem_buffer. Wrap that exact memory as a CPU +- // backend buffer (zero-copy: ggml_backend_cpu_buffer_from_ptr borrows +- // the ptr) and point every tensor's ->buffer at it, so graphs can +- // reference the loader tensors DIRECTLY as leaves (the gallocr treats +- // data!=NULL tensors as already-allocated and never copies them; +- // reshapes/views resolve at build time). Eliminates per-call recopy. ++ // backend buffer (zero-copy: buffer_from_host_ptr borrows the ptr) and ++ // point every tensor's ->buffer at it, so graphs can reference the loader ++ // tensors DIRECTLY as leaves (the gallocr treats data!=NULL tensors as ++ // already-allocated and never copies them; reshapes/views resolve at ++ // build time). Eliminates per-call recopy. + void* base = ggml_get_mem_buffer(ctx_); + size_t size = ggml_get_mem_size(ctx_); +- weights_buf_ = ggml_backend_cpu_buffer_from_ptr(base, size); ++ weights_buf_ = ggml_backend_dev_buffer_from_host_ptr(dev, base, size, SIZE_MAX); + if(!weights_buf_){ PK_LOG("realize_weights: buffer_from_ptr failed"); return false; } + for(auto& kv : tensors_) kv.second->buffer = weights_buf_; + return true; diff --git a/src/parakeet_capi.cpp b/src/parakeet_capi.cpp index 01de213..1ddc2f0 100644 --- a/src/parakeet_capi.cpp