Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,14 @@ jobs:
- os: ubuntu-latest
kind: dl
features: "--features dynamic-backends,vulkan"
# NOTE: Windows DL is intentionally NOT built yet. MSVC cannot defer the
# direct ggml_backend_cpu_init/_set_n_threads symbol references at DLL link
# time (no `-undefined dynamic_lookup` equivalent → LNK2019). A follow-up
# parakeet patch must route CPU-backend access through the ggml registry
# before Windows DL can link. Windows is not functional yet anyway.
# Windows: DL + Vulkan. The parakeet patch routes all CPU-backend access
# through the ggml device registry (no direct ggml_backend_cpu_* symbol
# references), so the shared parakeet/ggml core links cleanly under MSVC —
# no `-undefined dynamic_lookup` equivalent needed. The CPU + Vulkan
# backends ship as loadable modules, dlopen'd at runtime.
- os: windows-latest
kind: dl
features: "--features dynamic-backends,vulkan"
env:
# Pin a known-good LunarG SDK version for Windows (see llama.cpp CI).
VULKAN_VERSION: "1.4.313.2"
Expand Down
50 changes: 49 additions & 1 deletion parakeet-cpp-sys/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,14 +97,28 @@ fn main() {
// backend MODULES land in `bin/`. The static build keeps everything as
// `lib*.a` / `*.lib` across `lib/` + the build tree (+ `Release/` on the
// Windows multi-config generators).
let lib_dirs = [
let mut lib_dirs = vec![
dst.join("lib"),
dst.join("bin"),
dst.join("build"),
dst.join("lib").join("Release"),
dst.join("bin").join("Release"),
dst.join("build").join("Release"),
];
// parakeet.cpp has NO install() rule, so the parakeet library (static `.lib`/
// `.a`, or — under DL — the SHARED `.dll`/`.so`/`.dylib` + its MSVC import `.lib`)
// is never copied to the install prefix; it only exists somewhere in the build
// tree. ggml redirects DLLs to `<build>/bin` via CMAKE_RUNTIME_OUTPUT_DIRECTORY
// (a directory-scoped var that does NOT propagate to the parent parakeet
// scope), and the Ninja vs multi-config generators differ on where the import
// `.lib` lands. Rather than enumerate every layout, walk the whole build tree
// and add every dir that holds a linkable artifact, so `parakeet.lib` (the DL
// import lib that the consumer exe links) is found regardless of generator.
for dir in find_link_dirs(&dst.join("build")) {
if !lib_dirs.contains(&dir) {
lib_dirs.push(dir);
}
}
for dir in &lib_dirs {
println!("cargo:rustc-link-search=native={}", dir.display());
}
Expand Down Expand Up @@ -294,6 +308,40 @@ fn apply_patches(dir: &std::path::Path, root: &std::path::Path) {
}
}

/// Recursively collects every directory under `root` (including `root` itself)
/// that directly contains at least one linkable artifact (`.lib` / `.a` /
/// `.dll` / `.so` / `.dylib`, matched case-insensitively).
///
/// Used to find the parakeet library in the build tree: it has no install rule,
/// and its exact location varies by generator/platform.
///
/// Returns an empty vec if `root` is absent or unreadable; unreadable
/// sub-directories and directory entries are skipped silently (best-effort
/// scan). The result is sorted by path so the emitted link-search order is
/// reproducible across runs and filesystems; no deepest-first (or any other
/// depth-based) ordering is implied.
fn find_link_dirs(root: &std::path::Path) -> Vec<PathBuf> {
    // Extensions of artifacts the linker (or the runtime loader) can consume.
    const LINKABLE_EXTS: [&str; 5] = ["lib", "a", "dll", "so", "dylib"];

    // True if `p` ends in one of `LINKABLE_EXTS`. The comparison ignores ASCII
    // case because Windows file names are case-insensitive (`PARAKEET.LIB` is
    // as linkable as `parakeet.lib`).
    fn is_linkable(p: &std::path::Path) -> bool {
        p.extension()
            .and_then(|ext| ext.to_str())
            .is_some_and(|ext| {
                LINKABLE_EXTS
                    .iter()
                    .any(|known| ext.eq_ignore_ascii_case(known))
            })
    }

    let mut out = Vec::new();
    // Explicit work-list instead of recursion: build trees can be deep.
    let mut stack = vec![root.to_path_buf()];
    while let Some(dir) = stack.pop() {
        // A missing/unreadable directory contributes nothing.
        let Ok(entries) = std::fs::read_dir(&dir) else {
            continue;
        };
        let mut has_lib = false;
        for p in entries.filter_map(Result::ok).map(|e| e.path()) {
            if p.is_dir() {
                stack.push(p);
            } else if is_linkable(&p) {
                has_lib = true;
            }
        }
        if has_lib {
            out.push(dir);
        }
    }
    // `read_dir` yields entries in a platform/filesystem-dependent order; sort
    // so callers always see the same sequence for the same tree.
    out.sort();
    out
}

/// True if `dir` holds at least one dynamic library (`.dylib` / `.so` / `.dll`).
/// Used under DL to decide which install dirs deserve a runtime rpath.
fn dir_has_dynamic_lib(dir: &std::path::Path) -> bool {
Expand Down
155 changes: 154 additions & 1 deletion parakeet-cpp-sys/patches/parakeet/0001-backend-dl.patch
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index da2dfb8..4d84c3d 100644
index da2dfb8..a98c6d7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,16 +14,32 @@ option(PARAKEET_GGML_CUDA "Forward GGML_CUDA" OFF)
Expand Down Expand Up @@ -37,6 +37,21 @@ index da2dfb8..4d84c3d 100644
set(GGML_NATIVE ON CACHE BOOL "ggml: optimize the build for the current system" FORCE)
endif()
if(NOT DEFINED GGML_LLAMAFILE)
@@ -84,6 +100,14 @@ set(PARAKEET_SRC

if(PARAKEET_SHARED)
add_library(parakeet SHARED ${PARAKEET_SRC})
+ # The C API (parakeet_capi_*) is plain `extern "C"` with no
+ # __declspec(dllexport) / GGML_API-style macro, so on Windows the SHARED
+ # parakeet.dll would export nothing and MSVC would emit NO import library
+ # (parakeet.lib) — the consumer link then fails with
+ # `LNK1181: cannot open input file 'parakeet.lib'`. Auto-export all symbols so
+ # the import lib is produced (CMake generates a .def of every public symbol).
+ # No-op on ELF/Mach-O, which export by default.
+ set_target_properties(parakeet PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON)
else()
add_library(parakeet STATIC ${PARAKEET_SRC})
endif()
diff --git a/include/parakeet_capi.h b/include/parakeet_capi.h
index b082455..de21cd2 100644
--- a/include/parakeet_capi.h
Expand All @@ -58,6 +73,97 @@ index b082455..de21cd2 100644
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/src/backend.cpp b/src/backend.cpp
index 40055f9..03e2a56 100644
--- a/src/backend.cpp
+++ b/src/backend.cpp
@@ -6,7 +6,6 @@
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
-#include "ggml-cpu.h"

#include <cassert>
#include <cctype>
@@ -45,6 +44,45 @@ struct PendingCapture {
ggml_tensor* tensor;
std::vector<float>* dst;
};
+
+// CPU-backend access routed through the ggml device registry instead of the
+// CPU-module symbols (ggml_backend_cpu_init / _is_cpu / _set_n_threads). Under
+// GGML_BACKEND_DL the CPU backend is a dlopen'd module, so its symbols are NOT
+// resolvable at link time (MSVC rejects this with LNK2019); the registry API
+// (in ggml-base, always linked) is. The registry is populated for BOTH the
+// static build (GGML_USE_CPU registers the CPU device at startup) and the DL
+// build (global_backend() runs ggml_backend_load_all before any Backend is
+// constructed), so this single path works for both — no #ifdef needed.
+
+// Create a CPU backend via the registry: look up the CPU device, then init it.
+// Returns nullptr if no CPU device is registered.
+ggml_backend_t cpu_backend_init() {
+ ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+ return dev ? ggml_backend_dev_init(dev, nullptr) : nullptr;
+}
+
+// True if `backend` is a CPU-type device (registry replacement for
+// ggml_backend_is_cpu).
+bool backend_is_cpu(ggml_backend_t backend) {
+ if (!backend) return false;
+ ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+ return dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU;
+}
+
+// Set the thread count on a CPU backend without linking the CPU-module symbol:
+// fetch ggml_backend_set_n_threads from the backend's registry entry via
+// proc-address (the standard ggml way for dynamically-loaded backends — this is
+// exactly what llama.cpp does). No-op if the backend/device/reg/proc is absent.
+void backend_set_n_threads(ggml_backend_t backend, int n_threads) {
+ if (!backend) return;
+ ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+ if (!dev) return;
+ ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);
+ if (!reg) return;
+ auto fn = (ggml_backend_set_n_threads_t)ggml_backend_reg_get_proc_address(
+ reg, "ggml_backend_set_n_threads");
+ if (fn) fn(backend, n_threads);
+}
} // namespace

struct Backend::Impl {
@@ -124,7 +162,7 @@ Backend::Backend(int n_threads) : impl_(new Impl()) {
want.c_str());
}
if (!impl_->backend) { // CPU fallback (or CPU-only build)
- impl_->backend = ggml_backend_cpu_init();
+ impl_->backend = cpu_backend_init();
device_name_ = "cpu";
}
if (!impl_->backend) {
@@ -136,7 +174,7 @@ Backend::Backend(int n_threads) : impl_(new Impl()) {
// instead of aborting. The CPU/single-backend path keeps using the persistent
// gallocr below and is untouched.
if (impl_->use_sched) {
- impl_->cpu_backend = ggml_backend_cpu_init();
+ impl_->cpu_backend = cpu_backend_init();
if (!impl_->cpu_backend) {
PK_LOG("pk::Backend: CPU fallback init failed; disabling sched");
impl_->use_sched = false;
@@ -159,11 +197,11 @@ Backend::~Backend() {

void Backend::set_n_threads(int n_threads) {
n_threads_ = n_threads > 0 ? n_threads : 1;
- if (impl_ && impl_->backend && ggml_backend_is_cpu(impl_->backend)) {
- ggml_backend_cpu_set_n_threads(impl_->backend, n_threads_);
+ if (impl_ && impl_->backend && backend_is_cpu(impl_->backend)) {
+ backend_set_n_threads(impl_->backend, n_threads_);
}
if (impl_ && impl_->cpu_backend) {
- ggml_backend_cpu_set_n_threads(impl_->cpu_backend, n_threads_);
+ backend_set_n_threads(impl_->cpu_backend, n_threads_);
}
}

diff --git a/src/ggml_graph.cpp b/src/ggml_graph.cpp
index f5bf84a..6b9b45c 100644
--- a/src/ggml_graph.cpp
Expand Down Expand Up @@ -94,6 +200,53 @@ index f5bf84a..6b9b45c 100644
// Lazy create (reset-safe: shutdown_backend() can free it, and a later call
// recreates it). Always reached under g_backend_mutex (run_graph holds it)
// or before any inference thread exists, so a plain null-check is sufficient.
diff --git a/src/model_loader.cpp b/src/model_loader.cpp
index a1a0d66..b138216 100644
--- a/src/model_loader.cpp
+++ b/src/model_loader.cpp
@@ -3,8 +3,8 @@
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-alloc.h"
-#include "ggml-cpu.h"
#include "gguf.h"
+#include <cstdint>
#include <cstring>
#include <vector>
#include <utility>
@@ -73,18 +73,25 @@ bool ModelLoader::realize_weights(ggml_backend_t backend){
if(weights_buf_) return true; // idempotent
if(!backend || !ctx_){ PK_LOG("realize_weights: null backend/ctx"); return false; }

- if (ggml_backend_is_cpu(backend)) {
+ // CPU access is routed through the ggml device registry rather than the
+ // CPU-module symbols (ggml_backend_is_cpu / ggml_backend_cpu_buffer_from_ptr),
+ // so this links under GGML_BACKEND_DL (CPU is a dlopen'd module whose symbols
+ // are not available at link time) as well as in the static build. The CPU
+ // device's buffer_from_host_ptr maps to ggml_backend_cpu_buffer_from_ptr, so
+ // the zero-copy semantics below are unchanged.
+ ggml_backend_dev_t dev = ggml_backend_get_device(backend);
+ if (dev && ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
// Fast path: borrow the host ctx memory directly (no copy).
// The GGUF is loaded with no_alloc=false, so every tensor's data lives
// in one contiguous ctx mem_buffer. Wrap that exact memory as a CPU
- // backend buffer (zero-copy: ggml_backend_cpu_buffer_from_ptr borrows
- // the ptr) and point every tensor's ->buffer at it, so graphs can
- // reference the loader tensors DIRECTLY as leaves (the gallocr treats
- // data!=NULL tensors as already-allocated and never copies them;
- // reshapes/views resolve at build time). Eliminates per-call recopy.
+ // backend buffer (zero-copy: buffer_from_host_ptr borrows the ptr) and
+ // point every tensor's ->buffer at it, so graphs can reference the loader
+ // tensors DIRECTLY as leaves (the gallocr treats data!=NULL tensors as
+ // already-allocated and never copies them; reshapes/views resolve at
+ // build time). Eliminates per-call recopy.
void* base = ggml_get_mem_buffer(ctx_);
size_t size = ggml_get_mem_size(ctx_);
- weights_buf_ = ggml_backend_cpu_buffer_from_ptr(base, size);
+ weights_buf_ = ggml_backend_dev_buffer_from_host_ptr(dev, base, size, SIZE_MAX);
if(!weights_buf_){ PK_LOG("realize_weights: buffer_from_ptr failed"); return false; }
for(auto& kv : tensors_) kv.second->buffer = weights_buf_;
return true;
diff --git a/src/parakeet_capi.cpp b/src/parakeet_capi.cpp
index 01de213..1ddc2f0 100644
--- a/src/parakeet_capi.cpp
Expand Down
Loading