diff --git a/scripts/agentic-minimax.mjs b/scripts/agentic-minimax.mjs new file mode 100644 index 00000000..47870d6f --- /dev/null +++ b/scripts/agentic-minimax.mjs @@ -0,0 +1,593 @@ +#!/usr/bin/env node +/** + * Multi-step agentic test for minimax-m3:cloud through BOTH harnesses + * (Claude Agent SDK + Ollama, OpenCode + Ollama). + * + * Unlike the smoke tests (one tool, one shot), this gives the model a small + * toolbox over a fixture calendar + mailbox and a task that REQUIRES chaining: + * "check if I'm free this Friday and respond with a time" + * -> resolve "this Friday" -> list_calendar_events(date) -> find a free + * slot -> send_email_reply(proposed time) + * + * Every tool call routes through one shared executor that logs + * `→ name(args)` / `← result`, so the multi-step chain is visible and is + * identical across both harnesses. A per-scenario checker then verifies the + * model used the right tools in a sensible order and produced a correct reply. + * + * Usage: + * node scripts/agentic-minimax.mjs # both backends, both scenarios + * node scripts/agentic-minimax.mjs --backend claude + * node scripts/agentic-minimax.mjs --backend opencode --scenario friday + * node scripts/agentic-minimax.mjs --model minimax-m2.7:cloud + */ +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; +import { loadEnv } from "./lib/load-env.mjs"; +import { z } from "zod"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); + +loadEnv(join(__dirname, "..", ".env")); +loadEnv(join(__dirname, "..", ".env.local")); + +const OLLAMA_KEY = process.env.OLLAMA_API_KEY; +if (!OLLAMA_KEY) { + console.error("FAIL: OLLAMA_API_KEY required"); + process.exit(1); +} + +const arg = (flag, dflt) => { + const i = process.argv.indexOf(flag); + return i !== -1 && process.argv[i + 1] ? 
/**
 * Median of the usable entries of `a`.
 *
 * `null` / `undefined` entries are dropped first (a trial that never reached a
 * tool call reports `null` for its first-action time). The input array is not
 * modified.
 *
 * @param {Array<number|null|undefined>} a - samples, possibly with gaps
 * @returns {number|null} the median, the mean of the two middle values for an
 *   even count, or `null` when no usable sample remains
 */
function median(a) {
  const sorted = [];
  for (const sample of a) {
    if (sample != null) sorted.push(sample);
  }
  sorted.sort((lhs, rhs) => lhs - rhs);

  const count = sorted.length;
  if (count === 0) return null;

  const upper = Math.floor(count / 2);
  if (count % 2 === 1) return sorted[upper];
  return (sorted[upper - 1] + sorted[upper]) / 2;
}
(This invoice was paid on 2026-05-10.)", + }, +}; + +// --------------------------------------------------------------------------- +// Shared tool specs — one executor, used by both harnesses. +// --------------------------------------------------------------------------- +function makeTools(state) { + const log = (line) => { + console.log(line); + state.trace.push(line); + }; + const record = (name, args, result) => { + const t = state.t0 ? performance.now() - state.t0 : 0; + state.calls.push({ name, args, t }); + log(` [+${(t / 1000).toFixed(1)}s] → ${name}(${JSON.stringify(args)})`); + log(` ← ${JSON.stringify(result).slice(0, 220)}`); + return result; + }; + return [ + { + name: "list_calendar_events", + description: + "List the user's calendar events for a given day. date must be ISO format YYYY-MM-DD.", + shape: { date: z.string() }, + run: (a) => { + const day = CALENDAR[a.date]; + const result = day + ? { date: a.date, workingHours: day.workingHours, events: day.events } + : { date: a.date, workingHours: "09:00-17:00", events: [] }; + return record("list_calendar_events", a, result); + }, + }, + { + name: "search_emails", + description: "Search the mailbox. Returns matching messages (id, from, subject, date), newest first.", + shape: { query: z.string() }, + run: (a) => { + // Query-driven: a message matches if its subject or sender contains any + // term from the query. Newest first. + const terms = a.query.toLowerCase().split(/\s+/).filter(Boolean); + const hits = Object.entries(MAILBOX) + .filter(([, m]) => { + const hay = (m.subject + " " + m.from).toLowerCase(); + return terms.some((t) => hay.includes(t)); + }) + .map(([id, m]) => ({ id, from: m.from, subject: m.subject, date: m.date })) + .sort((x, y) => (x.date < y.date ? 
/**
 * Scenario catalogue. Each entry carries the task title, the system and user
 * prompts handed to the harness, and a `check(state)` grader.
 *
 * `check` reads the run state (`calls`: ordered `{ name, args, t }` tool
 * invocations; `sentReply`: `{ to, body }` or null) and returns
 * `{ pass, notes }`, where `notes` are the ✓/✗ lines printed under "checks:".
 */
const SCENARIOS = {
  friday: {
    title: "Check if I'm free this Friday and respond with a time",
    system:
      `You are an executive's email assistant. Today is ${TODAY}. ` +
      "Use the available tools to do real work — check the calendar before proposing any time, " +
      "and actually send the reply with send_email_reply. Propose a specific time that is genuinely free. " +
      "Keep the reply concise and warm.",
    prompt:
      "New email from Marcus Lee :\n" +
      "Subject: Quick call this Friday?\n\n" +
      "Hi — could we grab 30 minutes this Friday afternoon for a quick call about the Q3 plan? " +
      "Let me know what time works.\n\n" +
      "Task: check if I'm free this Friday afternoon and reply to Marcus proposing a specific time.",
    check: (state) => {
      const notes = [];
      const calVals = state.calls.filter((c) => c.name === "list_calendar_events");
      const calledRightDay = calVals.some((c) => c.args.date === FRIDAY);
      notes.push(
        `${calledRightDay ? "✓" : "✗"} queried calendar for ${FRIDAY} (got: ${calVals.map((c) => c.args.date).join(",") || "none"})`,
      );

      // findIndex yields -1 when a tool was never called. Both calls must
      // exist for the ordering to mean anything; a bare `<` comparison would
      // report "calendar before reply" for a run that never touched the
      // calendar (-1 < replyIdx).
      const firstCalendarIdx = state.calls.findIndex((c) => c.name === "list_calendar_events");
      const firstReplyIdx = state.calls.findIndex((c) => c.name === "send_email_reply");
      const calledBeforeReply =
        firstCalendarIdx !== -1 && firstReplyIdx !== -1 && firstCalendarIdx < firstReplyIdx;
      const sent = !!state.sentReply;
      notes.push(`${sent ? "✓" : "✗"} sent a reply via send_email_reply`);
      notes.push(`${calledBeforeReply ? "✓" : "✗"} checked calendar BEFORE replying`);

      // Did the proposed time land in a genuinely free window?
      // A good reply often *also* lists existing commitments for transparency,
      // written as ranges ("11:00–12:00"). The actual proposal is a standalone
      // time. So strip ranges first, then judge the remaining standalone times.
      let timeOk = null;
      if (sent) {
        // Strip commitment ranges ("11:00–12:00") the model lists for context.
        const proposalText = state.sentReply.body.replace(
          /\d{1,2}:\d{2}\s*[–\-—]\s*\d{1,2}:\d{2}/g,
          " ",
        );
        // A reply may transparently name an existing commitment ("I've got
        // something at 2:00 but I'm free after"). Exclude any standalone time
        // immediately preceded by a busy-cue so it isn't misread as a proposal.
        const BUSY_CUE =
          /(got|have|having|something|booked|busy|blocked|books|meeting|conflict|already|1:1|standup|review)\b[^\d]{0,14}$/i;
        const times = [...proposalText.matchAll(/(\d{1,2})(?::(\d{2}))?\s*(am|pm|AM|PM)?/g)]
          .filter((m) => !BUSY_CUE.test(proposalText.slice(Math.max(0, m.index - 26), m.index)))
          .map((m) => {
            let h = Number(m[1]);
            const min = Number(m[2] ?? 0);
            const ap = (m[3] ?? "").toLowerCase();
            if (ap === "pm" && h < 12) h += 12;
            if (ap === "am" && h === 12) h = 0;
            if (!ap && h >= 1 && h <= 6) h += 12; // bare 1-6 → afternoon
            return h * 60 + min;
          })
          // Only minutes-of-day inside the 09:00-17:00 working window count.
          .filter((t) => t >= 9 * 60 && t <= 17 * 60);
        const inBusy = (t) =>
          BUSY_FRIDAY.some(([s, e]) => {
            const ms = Number(s.slice(0, 2)) * 60 + Number(s.slice(3));
            const me = Number(e.slice(0, 2)) * 60 + Number(e.slice(3));
            return t >= ms && t < me;
          });
        const proposedFree = times.filter((t) => !inBusy(t));
        const proposedBusy = times.filter((t) => inBusy(t));
        // PASS if a standalone proposed time exists and none collide with a
        // meeting. null (undetermined) if the reply only used ranges.
        timeOk = times.length === 0 ? null : proposedBusy.length === 0;
        notes.push(
          `${timeOk === true ? "✓" : timeOk === null ? "·" : "✗"} proposed free time: [${proposedFree.map(min2s).join(", ") || "none parsed (ranges only)"}]` +
            (proposedBusy.length ? ` CONFLICTS: [${proposedBusy.map(min2s).join(", ")}]` : ""),
        );
      }
      const pass = calledRightDay && sent && calledBeforeReply && timeOk !== false;
      return { pass, notes };
    },
  },
  invoice: {
    title: "Find Acme's latest invoice and confirm we'll pay by the due date",
    system:
      `You are an executive's email assistant. Today is ${TODAY}. ` +
      "Use the tools to find the relevant email and read it before answering. " +
      "When done, send a reply confirming payment, citing the exact amount and due date.",
    prompt:
      "Task: find the latest invoice from Acme (billing@acme.com), then reply to them confirming " +
      "we will pay the amount due by the due date. Mention the exact amount and due date in the reply.",
    check: (state) => {
      const notes = [];
      const searched = state.calls.some((c) => c.name === "search_emails");
      const gotLatest = state.calls.some((c) => c.name === "get_email" && c.args.id === "inv-88");
      const sent = !!state.sentReply;
      notes.push(`${searched ? "✓" : "✗"} searched the mailbox`);
      notes.push(`${gotLatest ? "✓" : "✗"} opened the LATEST invoice (inv-88, 2026-05-28)`);
      notes.push(`${sent ? "✓" : "✗"} sent a confirmation reply`);
      let amountOk = null;
      let dueOk = null;
      if (sent) {
        // Thousands separators are removed first, so "4,250", "$4,250.00" and
        // "4250" all reduce to a text containing "4250".
        const b = state.sentReply.body.replace(/,/g, "");
        amountOk = /4250/.test(b);
        dueOk = /2026-06-15/.test(state.sentReply.body) || /june\s*15/i.test(state.sentReply.body);
        notes.push(`${amountOk ? "✓" : "✗"} reply cites amount $4,250`);
        notes.push(`${dueOk ? "✓" : "✗"} reply cites due date 2026-06-15`);
      }
      const pass = searched && gotLatest && sent && amountOk === true && dueOk === true;
      return { pass, notes };
    },
  },
};
"1"; + delete childEnv.CLAUDECODE; + + const mcpServer = createSdkMcpServer({ + name: "mail-app-tools", + version: "1.0.0", + tools: tools.map((t) => + tool(t.name, t.description, t.shape, async (args) => ({ + content: [{ type: "text", text: JSON.stringify(t.run(args)) }], + })), + ), + }); + + state.t0 = performance.now(); + const q = query({ + prompt: scenario.prompt, + options: { + model, + systemPrompt: scenario.system, + mcpServers: { "mail-app-tools": mcpServer }, + allowedTools: tools.map((t) => `mcp__mail-app-tools__${t.name}`), + maxTurns: 16, + permissionMode: "bypassPermissions", + settingSources: [], + persistSession: false, + env: childEnv, + stderr: () => {}, + }, + }); + + for await (const msg of q) { + if (msg.type === "assistant") { + for (const block of msg.message.content ?? []) { + if (block.type === "text" && block.text.trim()) { + state.assistantText = (state.assistantText ?? "") + block.text; + } + } + } + } + state.elapsedMs = performance.now() - state.t0; +} + +// --------------------------------------------------------------------------- +// Backend: OpenCode + Ollama (mirrors OpenCodeAgentProvider) +// --------------------------------------------------------------------------- +async function runOpencode(scenario, state, model) { + const binDir = join(__dirname, "..", "node_modules", ".bin"); + process.env.PATH = `${binDir}:${process.env.PATH ?? 
""}`; + const { createOpencodeServer } = await import("@opencode-ai/sdk"); + const { createOpencodeClient } = await import("@opencode-ai/sdk/client"); + const { McpServer } = await import("@modelcontextprotocol/sdk/server/mcp.js"); + const { StreamableHTTPServerTransport } = await import( + "@modelcontextprotocol/sdk/server/streamableHttp.js" + ); + const { createServer } = await import("node:http"); + const { randomUUID } = await import("node:crypto"); + + const tools = makeTools(state); + const mcp = new McpServer({ name: "mail-app-tools", version: "1.0.0" }); + for (const t of tools) { + mcp.registerTool( + t.name, + { description: t.description, inputSchema: t.shape }, + async (args) => ({ content: [{ type: "text", text: JSON.stringify(t.run(args)) }] }), + ); + } + const transport = new StreamableHTTPServerTransport({ sessionIdGenerator: () => randomUUID() }); + await mcp.connect(transport); + const httpServer = createServer((req, res) => { + const chunks = []; + req.on("data", (c) => chunks.push(c)); + req.on("end", () => { + const s = Buffer.concat(chunks).toString("utf8"); + let body; + try { + body = s ? 
JSON.parse(s) : undefined; + } catch { + /* ignore */ + } + transport.handleRequest(req, res, body).catch(() => {}); + }); + }); + await new Promise((r) => httpServer.listen(0, "127.0.0.1", r)); + const bridgeUrl = `http://127.0.0.1:${httpServer.address().port}/mcp`; + + const server = await createOpencodeServer({ + hostname: "127.0.0.1", + port: 0, + timeout: 60_000, + config: { + logLevel: "WARN", + mcp: { "mail-app-tools": { type: "remote", url: bridgeUrl, enabled: true } }, + permission: { edit: "allow", bash: "allow", webfetch: "allow" }, + disabled_providers: ["github-copilot", "openrouter", "google", "groq", "deepseek", "anthropic"], + provider: { + "ollama-cloud": { + name: "Ollama Cloud", + npm: "@ai-sdk/openai-compatible", + options: { baseURL: "https://ollama.com/v1", apiKey: OLLAMA_KEY }, + models: { [model]: { id: model, name: model, tool_call: true } }, + }, + }, + }, + }); + const client = createOpencodeClient({ baseUrl: server.url }); + const session = await client.session.create({ body: { title: "agentic-minimax" } }); + const sessionId = session.data.id; + + const abort = new AbortController(); + const evs = (await client.event.subscribe({ signal: abort.signal })).stream[Symbol.asyncIterator](); + state.t0 = performance.now(); + const promptPromise = client.session + .promptAsync({ + path: { id: sessionId }, + body: { + model: { providerID: "ollama-cloud", modelID: model }, + system: scenario.system, + tools: { write: false, edit: false, read: false, glob: false, grep: false, bash: false }, + parts: [{ type: "text", text: scenario.prompt }], + }, + }) + .catch((e) => { + state.error = e?.message ?? 
/**
 * Render a run as a one-line timeline: each tool call as "+<seconds>s <name>"
 * in call order, followed by the total elapsed time, all joined by " · ".
 *
 * @param {{ calls: Array<{ t: number, name: string }>, elapsedMs: number }} state
 *   run state; `t` and `elapsedMs` are milliseconds
 * @returns {string} e.g. "+1.2s search_emails · +2.5s get_email · 4.0s done"
 */
function fmtTimeline(state) {
  const toSeconds = (ms) => (ms / 1000).toFixed(1);
  let line = "";
  for (const { t, name } of state.calls) {
    line += `+${toSeconds(t)}s ${name} · `;
  }
  return `${line}${toSeconds(state.elapsedMs)}s done`;
}
/**
 * Driver: runs every (model × scenario × backend) cell `TRIALS` times, prints
 * a per-cell median table plus a per-model summary, and exits 0 only when at
 * least one run happened and every run passed.
 *
 * CLI selections are validated before anything runs, so a typo in
 * `--backend`, `--scenario` or `--trials` fails fast with a clear message
 * instead of crashing mid-run, running the wrong harness, or exiting 0
 * without having executed anything.
 *
 * @throws {Error} on an unknown backend/scenario or a non-positive-integer
 *   trial count (the caller's `.catch` turns this into exit code 1)
 */
async function main() {
  // Default to a head-to-head: the new model vs the current default vs prior MiniMax.
  const models = MODELS_OVERRIDE ?? ["minimax-m3:cloud", "kimi-k2.6:cloud", "minimax-m2.7:cloud"];
  const knownBackends = ["claude", "opencode"];
  const knownScenarios = Object.keys(SCENARIOS);
  const backends = ONLY_BACKEND ? [ONLY_BACKEND] : knownBackends;
  const scenarios = ONLY_SCENARIO ? [ONLY_SCENARIO] : ["friday", "invoice"];

  for (const b of backends) {
    if (!knownBackends.includes(b)) {
      throw new Error(`unknown --backend "${b}" (expected one of: ${knownBackends.join(", ")})`);
    }
  }
  for (const s of scenarios) {
    if (!knownScenarios.includes(s)) {
      throw new Error(`unknown --scenario "${s}" (expected one of: ${knownScenarios.join(", ")})`);
    }
  }
  if (!Number.isInteger(TRIALS) || TRIALS < 1) {
    throw new Error(`--trials must be a positive integer (got: ${TRIALS})`);
  }

  console.log(`=== Multi-step SPEED comparison · today=${TODAY} ===`);
  console.log(`models: ${models.join(", ")}`);
  console.log(`backends: ${backends.join(", ")} scenarios: ${scenarios.join(", ")}`);

  console.log(`trials: ${TRIALS} per cell (median reported)`);
  const rows = [];
  for (const m of models) {
    for (const s of scenarios) {
      for (const b of backends) {
        // Runs are deliberately sequential so timings don't contend.
        for (let i = 0; i < TRIALS; i++) rows.push(await runOne(m, b, s));
      }
    }
  }

  // Aggregate by (model, backend, scenario) over trials.
  const cells = [];
  for (const m of models) {
    for (const s of scenarios) {
      for (const b of backends) {
        const g = rows.filter((r) => r.model === m && r.backend === b && r.scenarioKey === s);
        if (!g.length) continue;
        cells.push({
          model: m,
          backend: b,
          scenario: s,
          n: g.length,
          steps: median(g.map((r) => r.steps)),
          firstMs: median(g.map((r) => r.firstToolMs)),
          totalMs: median(g.map((r) => r.ms)),
          passes: g.filter((r) => r.pass).length,
        });
      }
    }
  }

  const S = (ms) => (ms == null ? "—" : (ms / 1000).toFixed(1) + "s");
  console.log(`\n${"═".repeat(78)}`);
  console.log(`SPEED COMPARISON (multi-step wall-clock, median of ${TRIALS})`);
  console.log(
    "model".padEnd(20) +
      "backend".padEnd(10) +
      "scenario".padEnd(9) +
      "steps".padStart(6) +
      "1st-act".padStart(9) +
      "total".padStart(9) +
      " pass",
  );
  for (const c of cells) {
    console.log(
      c.model.padEnd(20) +
        c.backend.padEnd(10) +
        c.scenario.padEnd(9) +
        String(c.steps).padStart(6) +
        S(c.firstMs).padStart(9) +
        S(c.totalMs).padStart(9) +
        ` ${c.passes}/${c.n}`,
    );
  }

  console.log("\nPer-model median total (across scenarios/backends):");
  for (const m of models) {
    const mine = cells.filter((c) => c.model === m);
    console.log(
      ` ${m.padEnd(20)} median total=${S(median(mine.map((c) => c.totalMs)))} median 1st-action=${S(median(mine.map((c) => c.firstMs)))}`,
    );
  }
  console.log(`${"═".repeat(78)}`);
  // `every` is vacuously true on an empty list; zero runs must not count as success.
  process.exit(rows.length > 0 && rows.every((r) => r.pass) ? 0 : 1);
}
process.argv[argModels + 1].split(",") + : [ + "minimax-m3:cloud", // the new model under test + "minimax-m2.7:cloud", // prior MiniMax for a within-family comparison + "kimi-k2.6:cloud", // the app's CURRENT default (baseline) + ]; + +// A realistic inbound thread + drafting instruction. No tools — this isolates +// pure generation quality and speed, which is what draft-generator.ts exercises. +const SYSTEM = + "You are an executive's email assistant. Write concise, warm, professional replies in the executive's voice. " + + "Return ONLY the reply body — no subject line, no preamble, no sign-off placeholders like [Name]."; +const USER = `Reply to this email. Decline the speaking slot politely but offer to send a short written Q&A instead, and ask for their deadline. + +From: Priya Nadar +Subject: Keynote invite — DevSummit 2026 (June 18, Lisbon) + +Hi Ankit, + +We'd love to have you keynote DevSummit 2026 in Lisbon on June 18. The slot is 45 minutes plus 15 of Q&A, audience ~1,200 engineers. We can cover travel and two nights' hotel. Could you let us know by next Friday if you're able to join? + +Thanks so much, +Priya`; + +// Mirror the app: llm-service.ts floors Ollama max_tokens at 4096 so reasoning +// models can finish their hidden chain-of-thought AND still emit the answer. +// (Cost is $0 on the Ollama subscription, so the ceiling is free.) +const MAX_TOKENS = 4096; +const argTrials = process.argv.indexOf("--trials"); +const TRIALS = argTrials !== -1 && process.argv[argTrials + 1] ? Number(process.argv[argTrials + 1]) : 3; + +function median(arr) { + const v = arr.filter((x) => x != null).sort((a, b) => a - b); + if (!v.length) return null; + const mid = Math.floor(v.length / 2); + return v.length % 2 ? 
// --- OpenAI-compatible transport (OpenCode harness path) ---
/**
 * Stream one chat completion from the OpenAI-compatible endpoint and collect
 * latency and usage figures.
 *
 * The response is parsed as server-sent events: each `data:` line carries a
 * JSON chunk; `[DONE]` and unparsable lines are skipped. Reasoning deltas
 * (`delta.reasoning` / `delta.reasoning_content`) and visible deltas
 * (`delta.content`) are accumulated separately.
 *
 * @param {string} model - model tag sent in the request
 * @param {number} t0 - `performance.now()` reference the timings are measured from
 * @returns {Promise<{ttft: number|null, ttfv: number|null, total: number,
 *   completion: number, answer: string, reasoning: string}>} `ttft`/`ttfv` are
 *   ms to the first streamed / first visible token (null if none arrived),
 *   `completion` is the last reported `usage.completion_tokens` (0 if none)
 * @throws {Error} when the HTTP status is not OK (message carries the status
 *   and the first 200 characters of the body)
 */
async function runOpenAI(model, t0) {
  let ttft = null;
  let ttfv = null;
  let answer = "";
  let reasoning = "";
  let completion = 0;

  // Consume one raw SSE line, updating the accumulators above.
  const handleLine = (line) => {
    const s = line.trim();
    if (!s.startsWith("data:")) return;
    const payload = s.slice(5).trim();
    if (payload === "[DONE]") return;
    let json;
    try {
      json = JSON.parse(payload);
    } catch {
      return; // not a JSON chunk (keep-alive, partial garbage): skip it
    }
    if (json.usage?.completion_tokens != null) completion = json.usage.completion_tokens;
    const delta = json.choices?.[0]?.delta;
    if (!delta) return;
    const r = delta.reasoning ?? delta.reasoning_content;
    if (r) {
      if (ttft === null) ttft = performance.now() - t0;
      reasoning += r;
    }
    if (delta.content) {
      if (ttft === null) ttft = performance.now() - t0;
      if (ttfv === null) ttfv = performance.now() - t0;
      answer += delta.content;
    }
  };

  const res = await fetch("https://ollama.com/v1/chat/completions", {
    method: "POST",
    headers: { Authorization: `Bearer ${KEY}`, "Content-Type": "application/json" },
    body: JSON.stringify({
      model,
      max_tokens: MAX_TOKENS,
      stream: true,
      stream_options: { include_usage: true },
      messages: [
        { role: "system", content: SYSTEM },
        { role: "user", content: USER },
      ],
    }),
  });
  if (!res.ok) throw new Error(`HTTP ${res.status}: ${(await res.text()).slice(0, 200)}`);

  const reader = res.body.getReader();
  const decoder = new TextDecoder();
  let buf = "";
  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    buf += decoder.decode(value, { stream: true });
    // Everything before the last "\n" is complete; the remainder may be a
    // partial line that the next chunk finishes.
    const lines = buf.split("\n");
    buf = lines.pop() ?? "";
    for (const line of lines) handleLine(line);
  }
  // Flush bytes still held by the decoder and process the final line even
  // when the stream ended without a trailing newline; otherwise the last
  // delta and the usage block are silently lost.
  buf += decoder.decode();
  if (buf) handleLine(buf);

  const total = performance.now() - t0;
  return { ttft, ttfv, total, completion, answer, reasoning };
}
/**
 * Run `bench(model, transport)` `TRIALS` times and aggregate the successful
 * trials into medians.
 *
 * A trial that throws is recorded as an error and excluded from every median.
 * The error text is always a non-empty string, so a rejection with a
 * non-Error value or an Error with an empty message cannot be mistaken for a
 * successful (but metric-less) trial.
 *
 * @param {string} model
 * @param {string} transport - forwarded to `bench` unchanged
 * @returns {Promise<object>} `{ model, transport, n, errors, ttft, ttfv,
 *   thinkMs, total, completion, tokS, reasoning, answeredEvery, draft }` where
 *   `n` counts successful trials, `errors` lists failure messages in order,
 *   `reasoning` is true if any trial emitted reasoning text, `answeredEvery`
 *   is true only if there was at least one success and all had a non-blank
 *   answer, and `draft` is the last trial with a non-blank answer (falling
 *   back to the first successful trial, or null)
 */
async function benchTrials(model, transport) {
  const ok = [];
  const errors = [];
  let lastDraft = null;
  for (let i = 0; i < TRIALS; i++) {
    try {
      const r = await bench(model, transport);
      ok.push(r);
      // Prefer the most recent trial that actually produced visible text.
      if (r.answer?.trim()) lastDraft = r;
      else if (!lastDraft) lastDraft = r;
    } catch (e) {
      const message = (e instanceof Error && e.message ? e.message : String(e)) || "unknown error";
      errors.push(message);
    }
  }
  const agg = {
    model,
    transport,
    n: ok.length,
    errors,
    ttft: median(ok.map((t) => t.ttft)),
    ttfv: median(ok.map((t) => t.ttfv)),
    thinkMs: median(ok.map((t) => t.thinkMs)),
    total: median(ok.map((t) => t.total)),
    completion: median(ok.map((t) => t.completion)),
    tokS: median(ok.map((t) => t.tokS)),
    reasoning: ok.some((t) => t.reasoning),
    answeredEvery: ok.length > 0 && ok.every((t) => t.answer?.trim()),
    draft: lastDraft,
  };
  return agg;
}
` ERR:${r.errors[0]}` : ""}`, + ); + } + + // Transport parity: the new model on the OpenCode (OpenAI-compat) transport. + process.stdout.write(`[openai ] minimax-m3:cloud ×${TRIALS} … `); + const oc = await benchTrials("minimax-m3:cloud", "openai"); + rows.push(oc); + console.log( + `ttft=${fmt(oc.ttft)} think=${fmt(oc.thinkMs)} total=${fmt(oc.total)} tok=${fmt(oc.completion)} tok/s=${fmt(oc.tokS, 1)}` + + `${oc.answeredEvery ? "" : " ⚠ incomplete"}${oc.errors.length ? ` ERR:${oc.errors[0]}` : ""}`, + ); + + console.log("\n================= SPEED (median of " + TRIALS + ") ================="); + const H = (s, w) => s.padStart(w); + console.log( + "model".padEnd(20) + + "transport".padEnd(11) + + H("ttft(ms)", 10) + + H("think(ms)", 11) + + H("total(ms)", 11) + + H("out_tok", 9) + + H("tok/s", 8), + ); + for (const r of rows) { + console.log( + r.model.padEnd(20) + + r.transport.padEnd(11) + + fmt(r.ttft).padStart(10) + + fmt(r.thinkMs).padStart(11) + + fmt(r.total).padStart(11) + + fmt(r.completion).padStart(9) + + fmt(r.tokS, 1).padStart(8), + ); + } + console.log("ttft=time to 1st token · think=1st token→1st visible answer · total=full draft ready"); + + console.log("\n================= DRAFTS (quality) ================="); + for (const r of rows) { + const d = r.draft; + console.log(`\n----- ${r.model} [${r.transport}]${r.reasoning ? " (reasoning emitted)" : ""} -----`); + console.log(d?.answer?.trim() || "(no visible answer produced within token budget)"); + } + console.log(""); +} + +main().catch((e) => { + console.error(e); + process.exit(1); +}); diff --git a/scripts/lib/load-env.mjs b/scripts/lib/load-env.mjs new file mode 100644 index 00000000..a37b5b27 --- /dev/null +++ b/scripts/lib/load-env.mjs @@ -0,0 +1,21 @@ +import { existsSync, readFileSync } from "node:fs"; + +/** + * Minimal .env loader shared by the local minimax/Ollama test + benchmark + * scripts. 
/**
 * Minimal .env loader shared by the local minimax/Ollama test + benchmark
 * scripts.
 *
 * Parses `KEY=VALUE` lines. Blank lines, `#` comment lines and lines without
 * an `=` are skipped. Whitespace around the key and the value is ignored, a
 * leading `export ` is tolerated, and one pair of matching surrounding quotes
 * (double or single) is stripped from the value. Only the first `=` splits the
 * line, so values may themselves contain `=`.
 *
 * Keys already present in process.env are NOT overwritten, so an explicit
 * `OLLAMA_API_KEY=… node script.mjs` still wins.
 *
 * @param {string} path - Path of the .env file; a missing file is a no-op.
 * @returns {void}
 */
export function loadEnv(path) {
  if (!existsSync(path)) return;
  for (const rawLine of readFileSync(path, "utf8").split("\n")) {
    // trim() also drops the "\r" left behind by CRLF line endings.
    const line = rawLine.trim().replace(/^export\s+/, "");
    if (!line || line.startsWith("#")) continue;

    const eq = line.indexOf("=");
    if (eq === -1) continue;

    const key = line.slice(0, eq).trim();
    // `=value` has no usable name; never write an empty key into the env.
    if (!key) continue;

    let val = line.slice(eq + 1).trim();
    const quote = val[0];
    // Require two characters so a lone quote is kept as a literal value.
    if (val.length >= 2 && (quote === '"' || quote === "'") && val.endsWith(quote)) {
      val = val.slice(1, -1);
    }

    if (!(key in process.env)) process.env[key] = val;
  }
}
/**
 * Smoke test: drive a model (default minimax-m3:cloud) through the Claude
 * Agent SDK with the Anthropic endpoint pointed at Ollama Cloud, and check
 * that it can stream text, call an in-process MCP tool and finish.
 *
 * PASS requires all of:
 *   - the `get_email` handler actually ran,
 *   - the SDK surfaced a `tool_use` block for it,
 *   - some assistant text was streamed,
 *   - the terminal result message has subtype "success".
 * The MINIMAX_SDK_PASS token is reported but does not affect the verdict.
 *
 * Exit codes: 0 pass · 1 fail / missing key / query threw · 3 timed out.
 *
 * Usage: node scripts/test-minimax-claude-sdk.mjs [model]
 */
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
import { loadEnv } from "./lib/load-env.mjs";
import { z } from "zod";
import { query, tool, createSdkMcpServer } from "@anthropic-ai/claude-agent-sdk";

const __dirname = dirname(fileURLToPath(import.meta.url));
const MODEL = process.argv[2] ?? "minimax-m3:cloud";

// MODEL_ENV_VARS — kept in lockstep with claude-agent-provider.ts. If Claude
// Code falls back to any hardcoded Anthropic model for a subtask, it 404s on
// ollama.com, so every one must point at our model.
const MODEL_ENV_VARS = [
  "ANTHROPIC_MODEL",
  "ANTHROPIC_CUSTOM_MODEL",
  "ANTHROPIC_DEFAULT_HAIKU_MODEL",
  "ANTHROPIC_DEFAULT_SONNET_MODEL",
  "ANTHROPIC_DEFAULT_OPUS_MODEL",
  "ANTHROPIC_SMALL_FAST_MODEL",
  "CLAUDE_CODE_SUBAGENT_MODEL",
];

// .env first, then .env.local; loadEnv never overwrites keys that are already
// set, so the shell environment wins, then .env, then .env.local.
loadEnv(join(__dirname, "..", ".env"));
loadEnv(join(__dirname, "..", ".env.local"));

const OLLAMA_KEY = process.env.OLLAMA_API_KEY;
if (!OLLAMA_KEY) {
  console.error("FAIL: OLLAMA_API_KEY is required (load .env first)");
  process.exit(1);
}

// --- buildChildEnv() Ollama branch, replicated ---
const childEnv = { ...process.env };
childEnv.ANTHROPIC_BASE_URL = "https://ollama.com";
childEnv.ANTHROPIC_AUTH_TOKEN = OLLAMA_KEY;
childEnv.ANTHROPIC_API_KEY = OLLAMA_KEY;
for (const k of MODEL_ENV_VARS) childEnv[k] = MODEL;
childEnv.DISABLE_TELEMETRY = "1";
childEnv.DISABLE_ERROR_REPORTING = "1";
childEnv.CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC = "1";
childEnv.DO_NOT_TRACK = "1";
delete childEnv.CLAUDECODE;

console.log("=== MiniMax × Claude Agent SDK × Ollama Cloud ===");
console.log(`model: ${MODEL}`);
console.log(`base_url: ${childEnv.ANTHROPIC_BASE_URL}`);
// Only the first/last four characters of the key are ever printed.
console.log(`auth: ${OLLAMA_KEY.slice(0, 4)}…${OLLAMA_KEY.slice(-4)} (len=${OLLAMA_KEY.length})`);

// --- in-process MCP tool, same mechanism the provider uses for mail tools ---
// The handler records that it ran, which is separate evidence from the SDK
// merely emitting a tool_use block.
let toolCalled = false;
let toolArgs = null;
const mcpServer = createSdkMcpServer({
  name: "mail-app-tools",
  version: "1.0.0",
  tools: [
    tool(
      "get_email",
      "Returns the body of an email by id. Call this with id='42'.",
      { id: z.string() },
      async (args) => {
        toolCalled = true;
        toolArgs = args;
        return {
          content: [
            {
              type: "text",
              text: JSON.stringify({ subject: "Quarterly sync", body: `Email ${args.id}: please confirm Tuesday 3pm.` }),
            },
          ],
        };
      },
    ),
  ],
});

const PROMPT =
  "You have an MCP tool `get_email`. Call it with id='42', then in plain text " +
  "summarize the email in one sentence. End your reply with the literal token MINIMAX_SDK_PASS.";

const q = query({
  prompt: PROMPT,
  options: {
    model: MODEL,
    systemPrompt: "You are a QA agent. Follow instructions exactly. You MUST call the get_email tool.",
    mcpServers: { "mail-app-tools": mcpServer },
    allowedTools: ["mcp__mail-app-tools__get_email"],
    includePartialMessages: true,
    maxTurns: 12,
    permissionMode: "bypassPermissions",
    settingSources: [],
    persistSession: false,
    env: childEnv,
    stderr: (d) => process.stderr.write(`[claude-stderr] ${d}`),
  },
});

const toolCalls = [];
const textOut = [];
let resultMeta = null;

// Hard stop: if no result arrives in time, exit with a distinct code.
const timeout = setTimeout(() => {
  console.error("\nFAIL: 120s timeout waiting for result");
  process.exit(3);
}, 120_000);

try {
  for await (const msg of q) {
    if (msg.type === "system" && msg.subtype === "init") {
      console.log(`[init] session started; model=${msg.model ?? MODEL}`);
    }
    if (msg.type === "assistant") {
      for (const block of msg.message.content ?? []) {
        if (block.type === "tool_use") {
          toolCalls.push(block.name);
          console.log(`\n[tool_use] ${block.name} ${JSON.stringify(block.input)}`);
        } else if (block.type === "text" && block.text) {
          textOut.push(block.text);
          process.stdout.write(block.text);
        }
      }
    }
    if (msg.type === "result") {
      resultMeta = msg;
      console.log(
        `\n[result] subtype=${msg.subtype} turns=${msg.num_turns ?? "?"} duration_ms=${msg.duration_ms ?? "?"}`,
      );
    }
  }
} catch (err) {
  console.error(`\nFAIL: query threw: ${err instanceof Error ? err.stack : String(err)}`);
  process.exitCode = 1;
} finally {
  // Single place to disarm the watchdog, whichever way the loop ended.
  clearTimeout(timeout);
}
if (process.exitCode === 1) process.exit(1);

const finalText = textOut.join("");
const sdkSawToolCall = toolCalls.some((t) => t.endsWith("get_email"));
const sawText = finalText.trim().length > 0;
const sawToken = finalText.includes("MINIMAX_SDK_PASS");
const reachedResult = resultMeta != null;
// A result message alone is not enough: a run that ended with a non-success
// subtype did reach a terminal message but must not count as a pass.
const resultOk = resultMeta?.subtype === "success";

const pass = toolCalled && sdkSawToolCall && sawText && resultOk;

console.log("\n=========================");
console.log(`tool executed (handler ran): ${toolCalled} args=${JSON.stringify(toolArgs)}`);
console.log(`tool_use seen by SDK: ${sdkSawToolCall}`);
console.log(`streamed assistant text: ${sawText}`);
console.log(`MINIMAX_SDK_PASS token: ${sawToken}`);
console.log(`reached result: ${reachedResult} (${resultMeta?.subtype ?? "none"})`);
console.log(`result succeeded: ${resultOk}`);
console.log(`VERDICT: ${pass ? "PASS ✅" : "FAIL ❌"}`);
console.log("=========================");
process.exit(pass ? 0 : 1);
/**
 * Smoke test: drive a model (default minimax-m3:cloud) through the OpenCode
 * harness routed to Ollama Cloud.
 *
 * Mirrors OpenCodeAgentProvider.buildOpencodeConfig() + resolveRoute() in
 * src/main/agents/providers/opencode/opencode-agent-provider.ts:
 *   - provider "ollama-cloud" via @ai-sdk/openai-compatible
 *     baseURL https://ollama.com/v1, apiKey = OLLAMA_API_KEY
 *   - models registered with tool_call: true
 *   - session prompt routed to { providerID: "ollama-cloud", modelID: MODEL }
 *   - a remote MCP bridge exposing one fake mail tool
 *
 * PASS requires all of:
 *   - the `get_email` handler actually ran,
 *   - some assistant text was streamed,
 *   - the session reached `session.idle`,
 *   - no `session.error` / event-stream failure was seen.
 * The MINIMAX_OC_PASS token is reported but does not affect the verdict.
 *
 * Exit codes: 0 pass · 1 fail / missing key · 2 session could not be created.
 *
 * Usage: node scripts/test-minimax-opencode.mjs [model]
 */
import { delimiter, dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
import { loadEnv } from "./lib/load-env.mjs";
import { z } from "zod";

const __dirname = dirname(fileURLToPath(import.meta.url));
const MODEL = process.argv[2] ?? "minimax-m3:cloud";

loadEnv(join(__dirname, "..", ".env"));
loadEnv(join(__dirname, "..", ".env.local"));

const OLLAMA_KEY = process.env.OLLAMA_API_KEY;
if (!OLLAMA_KEY) {
  console.error("FAIL: OLLAMA_API_KEY is required (load .env first)");
  process.exit(1);
}

// Prepend node_modules/.bin so the OpenCode SDK finds the `opencode` binary.
// path.delimiter keeps this correct on platforms whose PATH separator is not ":".
const binDir = join(__dirname, "..", "node_modules", ".bin");
process.env.PATH = `${binDir}${delimiter}${process.env.PATH ?? ""}`;

// Imported only after PATH has been adjusted above.
const { createOpencodeServer } = await import("@opencode-ai/sdk");
const { createOpencodeClient } = await import("@opencode-ai/sdk/client");
const { McpServer } = await import("@modelcontextprotocol/sdk/server/mcp.js");
const { StreamableHTTPServerTransport } = await import(
  "@modelcontextprotocol/sdk/server/streamableHttp.js"
);
const { createServer } = await import("node:http");
const { randomUUID } = await import("node:crypto");

console.log("=== MiniMax × OpenCode × Ollama Cloud ===");
console.log(`model: ollama-cloud/${MODEL}`);
console.log(`base_url: https://ollama.com/v1`);
// Only the first/last four characters of the key are ever printed.
console.log(`auth: ${OLLAMA_KEY.slice(0, 4)}…${OLLAMA_KEY.slice(-4)} (len=${OLLAMA_KEY.length})`);

// --- Step 1: MCP bridge with one fake mail tool (mirrors McpBridge) ---
let toolWasCalled = false;
let toolArgsSeen = null;

const mcp = new McpServer({ name: "mail-app-tools", version: "1.0.0" });
mcp.registerTool(
  "get_email",
  {
    description: "Returns the body of an email by id. Call this with id='42'.",
    inputSchema: { id: z.string() },
  },
  async (args) => {
    toolWasCalled = true;
    toolArgsSeen = args;
    return {
      content: [
        {
          type: "text",
          text: JSON.stringify({ subject: "Quarterly sync", body: `Email ${args.id}: please confirm Tuesday 3pm.` }),
        },
      ],
    };
  },
);

const transport = new StreamableHTTPServerTransport({ sessionIdGenerator: () => randomUUID() });
await mcp.connect(transport);

const httpServer = createServer((req, res) => {
  const chunks = [];
  req.on("data", (c) => chunks.push(c));
  req.on("end", () => {
    const bodyStr = Buffer.concat(chunks).toString("utf8");
    let body;
    if (bodyStr) {
      try {
        body = JSON.parse(bodyStr);
      } catch {
        // The request stream is already consumed, so the transport cannot
        // re-read it; answer the malformed request here instead of forwarding
        // an undefined body.
        res
          .writeHead(400, { "content-type": "application/json" })
          .end(JSON.stringify({ jsonrpc: "2.0", error: { code: -32700, message: "Parse error" }, id: null }));
        return;
      }
    }
    transport.handleRequest(req, res, body).catch((err) => {
      console.error("MCP handler error:", err);
      // Do not leave the caller waiting on a response that will never come.
      if (!res.headersSent) res.writeHead(500);
      if (!res.writableEnded) res.end();
    });
  });
});
// Port 0 = let the OS pick a free port; reject if listening fails.
await new Promise((resolve, reject) => {
  httpServer.once("error", reject);
  httpServer.listen(0, "127.0.0.1", () => {
    httpServer.off("error", reject);
    resolve();
  });
});
const bridgeUrl = `http://127.0.0.1:${httpServer.address().port}/mcp`;
console.log(`[oc] MCP bridge: ${bridgeUrl}`);

// --- Step 2: OpenCode server config — buildOpencodeConfig() Ollama branch ---
const ocConfig = {
  logLevel: "WARN",
  mcp: {
    "mail-app-tools": { type: "remote", url: bridgeUrl, enabled: true },
  },
  permission: { edit: "allow", bash: "allow", webfetch: "allow" },
  // ollama active, anthropic inactive -> anthropic added to disabled list
  disabled_providers: ["github-copilot", "openrouter", "google", "groq", "deepseek", "anthropic"],
  provider: {
    "ollama-cloud": {
      name: "Ollama Cloud",
      npm: "@ai-sdk/openai-compatible",
      options: {
        baseURL: "https://ollama.com/v1",
        apiKey: OLLAMA_KEY,
      },
      models: {
        [MODEL]: { id: MODEL, name: MODEL, tool_call: true },
      },
    },
  },
};

console.log("[oc] starting OpenCode server (may install @ai-sdk/openai-compatible on first run)…");
const server = await createOpencodeServer({
  hostname: "127.0.0.1",
  port: 0,
  timeout: 60_000,
  config: ocConfig,
});
console.log(`[oc] server: ${server.url}`);
const client = createOpencodeClient({ baseUrl: server.url });

// Stops both local servers; called on every exit path after this point.
const shutdown = () => {
  server.close();
  httpServer.close();
};

// Best-effort sanity check only: a missing tool id is a warning, not a failure.
try {
  const ids = await client.tool.ids({ query: { directory: process.cwd() } });
  const idList = Array.isArray(ids.data) ? ids.data : [];
  if (!idList.some((t) => /get_email/.test(t))) {
    console.warn(`[oc] WARNING: get_email not in tool list yet (may load lazily)`);
  } else {
    console.log(`[oc] tool registered: get_email`);
  }
} catch (err) {
  console.warn(`[oc] tool.ids check failed: ${err?.message ?? err}`);
}

// --- Step 3: session + prompt routed to ollama-cloud/minimax ---
const session = await client.session.create({ body: { title: "minimax-oc-smoke" } });
const sessionId = session.data?.id;
if (!sessionId) {
  console.error("FAIL: session.create returned no id");
  shutdown();
  process.exit(2);
}

// Subscribe before prompting so no early event is missed.
const abort = new AbortController();
const eventResult = await client.event.subscribe({ signal: abort.signal });
const eventIter = eventResult.stream[Symbol.asyncIterator]();

const PROMPT =
  "You have an MCP tool `get_email`. Call it with id='42', then in plain text " +
  "summarize the email in one sentence. End your reply with the literal token MINIMAX_OC_PASS.";

// The catch guarantees this promise never rejects, so it is safe to await later.
const promptPromise = client.session
  .promptAsync({
    path: { id: sessionId },
    body: {
      model: { providerID: "ollama-cloud", modelID: MODEL },
      system: "You are a QA agent. Follow instructions exactly. You MUST use the get_email tool.",
      tools: { write: false, edit: false, read: false, glob: false, grep: false, bash: false },
      parts: [{ type: "text", text: PROMPT }],
    },
  })
  .catch((err) => {
    console.error(`[oc] promptAsync rejected: ${err?.message ?? err}`);
    abort.abort();
  });

let textOut = "";
const toolCallsSeen = [];
let sessionErrored = null;
let sawIdle = false;
let timedOut = false;
const overallTimeout = setTimeout(() => {
  timedOut = true;
  console.error("\n[oc] FAIL: 150s timeout waiting for session.idle");
  abort.abort();
}, 150_000);

try {
  while (true) {
    const step = await eventIter.next();
    if (step.done) break;
    const ev = step.value;

    if (ev.type === "message.part.updated") {
      const part = ev.properties.part;
      if (part.type === "text" && part.sessionID === sessionId) {
        if (ev.properties.delta) {
          process.stdout.write(ev.properties.delta);
          textOut += ev.properties.delta;
        } else if (part.text && part.text.length > textOut.length && part.text.startsWith(textOut)) {
          // No delta supplied: print only the suffix not yet written.
          const delta = part.text.slice(textOut.length);
          process.stdout.write(delta);
          textOut = part.text;
        }
      } else if (part.type === "tool" && part.sessionID === sessionId) {
        if (!toolCallsSeen.includes(part.callID)) {
          toolCallsSeen.push(part.callID);
          console.log(`\n[oc] tool_call_start: ${part.tool}`);
        }
        if (part.state?.status === "completed") console.log(`[oc] tool_call_end: ${part.tool}`);
      }
    }
    if (ev.type === "session.idle" && ev.properties.sessionID === sessionId) {
      sawIdle = true;
      console.log("\n[oc] session.idle");
      break;
    }
    if (ev.type === "session.error" && ev.properties.sessionID === sessionId) {
      sessionErrored = ev.properties.error;
      console.error("\n[oc] session.error", JSON.stringify(ev.properties.error));
      break;
    }
  }
} catch (err) {
  // Aborting the subscription (timeout or rejected prompt) may surface as a
  // rejected next(); that is an expected way for the loop to end. Anything
  // else is a genuine stream failure and must fail the run.
  if (!abort.signal.aborted) {
    sessionErrored = { name: "EventStreamError", message: err?.message ?? String(err) };
    console.error(`\n[oc] event stream failed: ${err?.message ?? err}`);
  }
} finally {
  clearTimeout(overallTimeout);
  abort.abort();
  await promptPromise;
  shutdown();
}

// --- Step 4: verdict ---
const sawText = textOut.trim().length > 0;
const sawToken = textOut.includes("MINIMAX_OC_PASS");
// Reaching session.idle is required: without it a timeout or an early end of
// the event stream could still pass on partial output.
const pass = toolWasCalled && sawText && sawIdle && !sessionErrored;

console.log("\n=========================");
console.log(`tool executed (handler ran): ${toolWasCalled} args=${JSON.stringify(toolArgsSeen)}`);
console.log(`streamed assistant text: ${sawText}`);
console.log(`MINIMAX_OC_PASS token: ${sawToken}`);
console.log(`reached session.idle: ${sawIdle}${timedOut ? " (timed out)" : ""}`);
console.log(`session.error: ${sessionErrored ? JSON.stringify(sessionErrored) : "none"}`);
console.log(`VERDICT: ${pass ? "PASS ✅" : "FAIL ❌"}`);
console.log("=========================");
process.exit(pass ? 0 : 1);