-
Notifications
You must be signed in to change notification settings - Fork 0
docs: add openai-bridge-demo.mjs (sister of embeddinggemma-demo) #9
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
MauricioPerera
wants to merge
2
commits into
main
Choose a base branch
from
docs/openai-bridge-example
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,162 @@ | ||
| // Sister of embeddinggemma-demo.mjs that uses openai-workers-ai-bridge | ||
| // (https://github.com/MauricioPerera/openai-workers-ai-bridge) instead of | ||
| // calling the Workers AI REST API directly. | ||
| // | ||
| // What this demonstrates over the direct-REST demo: | ||
| // * Server-side Matryoshka truncation — passing `dimensions: 256` returns | ||
| // a 256-dim vector that's already L2-renormalised, so we don't truncate | ||
| // and pad on the client side. Smaller wire payloads, smaller storage. | ||
| // * One backend URL across every OpenAI-compatible client (this script, | ||
| // LangChain, n8n, LibreChat, the OpenAI SDK), no per-tool key plumbing. | ||
| // * Idempotent embedding cache: the bridge stashes vectors at the edge, | ||
| // so re-runs of the same script cost 0 neurons after the first. | ||
| // | ||
| // Run from repo root: | ||
| // pnpm install | ||
| // pnpm build | ||
| // node examples/smoke/openai-bridge-demo.mjs \ | ||
| // --bridge=https://openai-workers-ai-bridge.<sub>.workers.dev/v1 \ | ||
| // --key=sk-cfwai-... | ||
|
|
||
| import OpenAI from "openai"; | ||
| import { Bash } from "just-bash"; | ||
| import { createDataPlugin } from "just-bash-data"; | ||
| import { promises as fsp } from "node:fs"; | ||
| import * as nodePath from "node:path"; | ||
| import { fileURLToPath } from "node:url"; | ||
|
|
||
| const SCRIPT_DIR = nodePath.dirname(fileURLToPath(import.meta.url)); | ||
| const STATE_DIR = nodePath.resolve(SCRIPT_DIR, "agent-data-bridge"); | ||
|
|
||
| const args = Object.fromEntries( | ||
| process.argv.slice(2).map((a) => { | ||
| const m = /^--([^=]+)=(.*)$/.exec(a); | ||
| return m ? [m[1], m[2]] : [a, "true"]; | ||
| }), | ||
| ); | ||
| const BRIDGE_URL = args["bridge"] ?? process.env.BRIDGE_URL; | ||
| const API_KEY = args["key"] ?? process.env.BRIDGE_API_KEY; | ||
| if (!BRIDGE_URL || !API_KEY) { | ||
| console.error("Missing --bridge=<URL> --key=<API_KEY>"); | ||
| console.error("Or set BRIDGE_URL and BRIDGE_API_KEY env vars."); | ||
| process.exit(2); | ||
| } | ||
| const DIM = Number(args["dim"] ?? 256); | ||
|
|
||
| const client = new OpenAI({ apiKey: API_KEY, baseURL: BRIDGE_URL }); | ||
|
|
||
| async function embed(texts) { | ||
| const res = await client.embeddings.create({ | ||
| model: "embeddinggemma", | ||
| input: texts, | ||
| dimensions: DIM, | ||
| }); | ||
| return res.data.map((d) => d.embedding); | ||
| } | ||
|
|
||
| class DiskFs { | ||
| async readFile(p, opts) { | ||
| const enc = typeof opts === "string" ? opts : opts?.encoding ?? "utf8"; | ||
| return fsp.readFile(p, enc); | ||
| } | ||
| async readFileBuffer(p) { return new Uint8Array(await fsp.readFile(p)); } | ||
| async writeFile(p, content, opts) { | ||
| await fsp.mkdir(nodePath.dirname(p), { recursive: true }); | ||
| if (content instanceof Uint8Array) return fsp.writeFile(p, content); | ||
| const enc = typeof opts === "string" ? opts : opts?.encoding ?? "utf8"; | ||
| return fsp.writeFile(p, content, enc); | ||
| } | ||
| async exists(p) { try { await fsp.access(p); return true; } catch { return false; } } | ||
| async stat(p) { | ||
| const s = await fsp.stat(p); | ||
| return { isFile: s.isFile(), isDirectory: s.isDirectory(), isSymbolicLink: false, mode: s.mode, size: s.size, mtime: s.mtime }; | ||
| } | ||
| async mkdir(p, opts) { await fsp.mkdir(p, { recursive: opts?.recursive ?? false }); } | ||
| async readdir(p) { return fsp.readdir(p); } | ||
| async rm(p, opts) { | ||
| try { await fsp.rm(p, { force: opts?.force ?? false, recursive: opts?.recursive ?? false }); } | ||
| catch (e) { if (!opts?.force) throw e; } | ||
| } | ||
| async mv(src, dest) { return fsp.rename(src, dest); } | ||
| resolvePath(base, p) { return nodePath.resolve(base, p); } | ||
| async appendFile() { throw new Error("ni"); } | ||
| async cp() { throw new Error("ni"); } | ||
| async chmod() {} | ||
| async symlink() { throw new Error("ni"); } | ||
| async link() { throw new Error("ni"); } | ||
| async readlink() { throw new Error("ni"); } | ||
| async lstat(p) { return this.stat(p); } | ||
| async realpath(p) { return p; } | ||
| async utimes() {} | ||
| getAllPaths() { return []; } | ||
| } | ||
|
|
||
| const concepts = [ | ||
| { id: "en-cat", lang: "en", text: "A cat is a small carnivorous mammal kept as a pet." }, | ||
| { id: "es-cat", lang: "es", text: "Un gato es un pequeño mamífero carnívoro mantenido como mascota." }, | ||
| { id: "ja-cat", lang: "ja", text: "猫は飼われている小さな肉食哺乳類です。" }, | ||
| { id: "ar-cat", lang: "ar", text: "القطة حيوان ثديي مفترس صغير يربى كحيوان أليف." }, | ||
| { id: "hi-cat", lang: "hi", text: "बिल्ली एक छोटा मांसाहारी स्तनधारी है जो पालतू रखा जाता है।" }, | ||
| { id: "en-rocket", lang: "en", text: "A rocket is a vehicle that uses thrust to travel into space." }, | ||
| { id: "es-rocket", lang: "es", text: "Un cohete es un vehículo que usa empuje para viajar al espacio." }, | ||
| { id: "ja-rocket", lang: "ja", text: "ロケットは推進力を使って宇宙へ飛ぶ乗り物です。" }, | ||
| { id: "ar-rocket", lang: "ar", text: "الصاروخ مركبة تستخدم الدفع للسفر إلى الفضاء." }, | ||
| { id: "hi-rocket", lang: "hi", text: "रॉकेट एक वाहन है जो अंतरिक्ष में यात्रा करने के लिए जोर का उपयोग करता है।" }, | ||
| { id: "en-bread", lang: "en", text: "Bread is a staple food made from flour and water, then baked." }, | ||
| { id: "es-bread", lang: "es", text: "El pan es un alimento básico hecho de harina y agua, luego horneado." }, | ||
| { id: "ja-bread", lang: "ja", text: "パンは小麦粉と水から作られて焼かれる主食です。" }, | ||
| { id: "ar-bread", lang: "ar", text: "الخبز غذاء أساسي يُصنع من الدقيق والماء ثم يُخبز." }, | ||
| { id: "hi-bread", lang: "hi", text: "रोटी आटे और पानी से बनी और बेक की गई एक मुख्य खाद्य है।" }, | ||
| ]; | ||
|
|
||
| console.log(`══ openai-workers-ai-bridge × just-bash-data demo (dim=${DIM}) ══\n`); | ||
|
|
||
| console.log(`[1/4] Embedding ${concepts.length} concepts via the bridge (Matryoshka dim=${DIM})…`); | ||
| const t0 = Date.now(); | ||
| const vectors = await embed(concepts.map((c) => c.text)); | ||
| const embedMs = Date.now() - t0; | ||
| console.log(` ✓ Got ${vectors.length} vectors of dim ${vectors[0].length} in ${embedMs} ms`); | ||
| console.log(` (re-running this script costs 0 neurons — bridge edge-caches identical inputs)\n`); | ||
|
|
||
| console.log(`[2/4] Booting just-bash-data plugin (disk-backed at ${STATE_DIR})…`); | ||
| await fsp.rm(STATE_DIR, { recursive: true, force: true }); | ||
| const fs = new DiskFs(); | ||
| const bash = new Bash({ fs, cwd: STATE_DIR, customCommands: createDataPlugin({ rootDir: STATE_DIR }) }); | ||
| const exec = async (line) => { | ||
| const r = await bash.exec(line); | ||
| if (r.exitCode !== 0) throw new Error(`[exit ${r.exitCode}] ${line}\n${r.stderr}`); | ||
| return r.stdout ? JSON.parse(r.stdout) : null; | ||
| }; | ||
| await exec(`vec create concepts --dim ${DIM} --metric cosine`); | ||
| console.log(` ✓ Collection 'concepts' (cosine, ${DIM}d float32)\n`); | ||
|
|
||
| console.log(`[3/4] Inserting all ${concepts.length} vectors…`); | ||
| const insertStart = Date.now(); | ||
| for (let i = 0; i < concepts.length; i++) { | ||
| const c = concepts[i]; | ||
| const meta = JSON.stringify({ lang: c.lang, text: c.text }); | ||
| await exec(`vec store concepts ${c.id} '${JSON.stringify(vectors[i])}' --meta '${meta}'`); | ||
| } | ||
| const insertMs = Date.now() - insertStart; | ||
| console.log(` ✓ ${concepts.length} inserts in ${insertMs} ms (${(insertMs / concepts.length).toFixed(1)} ms/insert)\n`); | ||
|
|
||
| console.log(`[4/4] Cross-lingual search (k=3)…`); | ||
| const queries = [ | ||
| { lang: "en→all", q: "A small furry pet that purrs" }, | ||
| { lang: "es→all", q: "Vehículo espacial con propulsión" }, | ||
| { lang: "ja→all", q: "小麦から作る焼いた食べ物" }, | ||
| ]; | ||
| const qVecs = await embed(queries.map((q) => q.q)); | ||
| for (let i = 0; i < queries.length; i++) { | ||
| const q = queries[i]; | ||
| const v = qVecs[i]; | ||
| const hits = await exec(`vec search concepts '${JSON.stringify(v)}' --k 3`); | ||
| const ids = hits.map((h) => `${h.id}@${h.score.toFixed(3)}`).join(", "); | ||
| console.log(` [${q.lang}] "${q.q}"`); | ||
| console.log(` → ${ids}`); | ||
| } | ||
| console.log(""); | ||
|
|
||
| console.log(`── done. The bridge handed back ${DIM}-dim L2-normalised vectors,`); | ||
| console.log(` so just-bash-data's vec store consumed them as-is — no client-side`); | ||
| console.log(` truncation, no padding, no extra renorm.`); | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The `DiskFs.readFile` implementation incorrectly defaults to `utf8` encoding when no options are provided. In the `IFileSystem` interface (and standard Node.js `fs`), `readFile` should return a `Uint8Array` (or Buffer) if no encoding is specified. Defaulting to `utf8` will cause data corruption when reading binary files, such as the `.bin` files used by the vector store for quantized embeddings.