From b68a586a92eabaa129e1919aa1c440a119c52a3d Mon Sep 17 00:00:00 2001 From: Ankit Gupta Date: Tue, 16 Jun 2026 16:43:14 -0400 Subject: [PATCH 1/2] feat: make GLM 5.2 the default Ollama Cloud model Switch DEFAULT_OLLAMA_MODEL from kimi-k2.6:cloud to glm-5.2:cloud. Validated with a 16-task agent-sidebar benchmark (real test inbox, interleaved, warm, auto-draft disabled) against kimi-k2.7-code:cloud. GLM 5.2 was the faster model in both runs (faster on 10/16 then 13/16) at parity final-answer quality. Both models clearly beat the prior kimi-k2.6 default. Reasoning is returned in proper `thinking` blocks on Ollama's Anthropic-compat endpoint, so chain-of-thought does not leak into drafts. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/shared/types.ts | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/shared/types.ts b/src/shared/types.ts index ed28e3ab..61dd52b0 100644 --- a/src/shared/types.ts +++ b/src/shared/types.ts @@ -368,17 +368,22 @@ export type SenderLookupProvider = z.infer; /** * Default Ollama Cloud model when none is configured. * - * kimi-k2.6:cloud is Moonshot's latest. Chosen over deepseek-v4-pro:cloud - * because deepseek's compat-layer thinking depth is capped at default-level - * (no way to invoke "max" through Ollama's Anthropic-compat endpoint — - * filed upstream at ollama/ollama#15952), which made the agent feel - * underpowered. kimi-k2.6 doesn't need a thinking knob to perform well. + * glm-5.2:cloud (z.ai, 744B MoE, 1M context). Chosen after a 16-task + * agent-sidebar benchmark against kimi-k2.7-code:cloud: GLM 5.2 was + * consistently the faster model at parity final-answer quality (both clearly + * beat the prior kimi-k2.6 default). 
Reasoning is returned in proper + * `thinking` blocks on Ollama's Anthropic-compat endpoint (the agent path), + * so chain-of-thought does not leak into drafts; non-agent features use the + * native /api/chat path with think:true, which likewise keeps CoT out of + * parsed JSON. Note: GLM emits brief inter-turn progress text in agent loops + * (visible as status lines in the sidebar) — suppressible via the agent + * system prompt if undesired. * * Note: cloud models may occasionally return overloaded_error during peak * traffic — llm-service.ts retry logic catches this via Anthropic.APIError * status 529 (rate_limit category) and backs off automatically. */ -export const DEFAULT_OLLAMA_MODEL = "kimi-k2.6:cloud"; +export const DEFAULT_OLLAMA_MODEL = "glm-5.2:cloud"; export const OllamaCloudConfigSchema = z.object({ apiKey: z.string().default(""), From 226806d836b6e10edb227ec194934b84ff9bf070 Mon Sep 17 00:00:00 2001 From: Ankit Gupta Date: Tue, 16 Jun 2026 17:17:36 -0400 Subject: [PATCH 2/2] fix(evals): eval harness used retired default model (claude-sonnet-4-20250514) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The eval runners constructed services with no model argument, falling back to the retired legacy constructor default `claude-sonnet-4-20250514`, which now 404s — breaking `npm run eval` (and the pre-pr eval gate) for everyone. The app itself is unaffected because it passes the resolved live model. Pass each eval service the app's actual default model via resolveModelId(DEFAULT_MODEL_CONFIG[feature]) so the eval tracks real default resolution and can't rot when a model id is retired: - runner.ts (analyzer) - features/draft-generator.ts - features/calendaring-agent.ts - features/archive-ready-analyzer.ts Verified: analyzer eval 10/10 no regressions; feature evals pass with +0.0 deltas vs baseline. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/evals/features/archive-ready-analyzer.ts | 3 ++- tests/evals/features/calendaring-agent.ts | 3 ++- tests/evals/features/draft-generator.ts | 3 ++- tests/evals/runner.ts | 5 ++++- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/evals/features/archive-ready-analyzer.ts b/tests/evals/features/archive-ready-analyzer.ts index 6d8ff048..5e8537ad 100644 --- a/tests/evals/features/archive-ready-analyzer.ts +++ b/tests/evals/features/archive-ready-analyzer.ts @@ -7,6 +7,7 @@ */ import { ArchiveReadyAnalyzer } from "../../../src/main/services/archive-ready-analyzer"; import type { DashboardEmail } from "../../../src/shared/types"; +import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types"; interface ArchiveReadyFixtureInput { thread: DashboardEmail[]; @@ -28,7 +29,7 @@ export async function runArchiveReadyFixture( `[archive-ready] fixture ${fixtureId}: input must be { thread: DashboardEmail[], userEmail? 
}`, ); } - const analyzer = new ArchiveReadyAnalyzer(); + const analyzer = new ArchiveReadyAnalyzer(resolveModelId(DEFAULT_MODEL_CONFIG.archiveReady)); const result = await analyzer.analyzeThread(input.thread, input.userEmail); return JSON.stringify(result, null, 2); } diff --git a/tests/evals/features/calendaring-agent.ts b/tests/evals/features/calendaring-agent.ts index 8104f9f4..66f7e2ef 100644 --- a/tests/evals/features/calendaring-agent.ts +++ b/tests/evals/features/calendaring-agent.ts @@ -7,6 +7,7 @@ */ import { CalendaringAgent } from "../../../src/main/services/calendaring-agent"; import type { Email } from "../../../src/shared/types"; +import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types"; interface CalendaringFixtureInput { email: Email; @@ -28,7 +29,7 @@ export async function runCalendaringFixture( if (!isInput(input)) { throw new Error(`[calendaring-agent] fixture ${fixtureId}: input must be { email }`); } - const agent = new CalendaringAgent(); + const agent = new CalendaringAgent(resolveModelId(DEFAULT_MODEL_CONFIG.calendaring)); const result = await agent.analyze(input.email); return JSON.stringify(result, null, 2); } diff --git a/tests/evals/features/draft-generator.ts b/tests/evals/features/draft-generator.ts index 19646894..2efc1f4d 100644 --- a/tests/evals/features/draft-generator.ts +++ b/tests/evals/features/draft-generator.ts @@ -10,6 +10,7 @@ */ import { DraftGenerator } from "../../../src/main/services/draft-generator"; +import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types"; import type { Email, AnalysisResult } from "../../../src/shared/types"; interface DraftGeneratorFixtureInput { @@ -42,7 +43,7 @@ export async function runDraftGeneratorFixture( // ships, not a test-only configuration. EA + sender lookup are off so // we isolate the draft-generation behavior; those flows have their // own (TODO) eval suites. 
- const generator = new DraftGenerator(); + const generator = new DraftGenerator(resolveModelId(DEFAULT_MODEL_CONFIG.drafts)); const response = await generator.generateDraft(input.email, input.analysis, undefined, { enableSenderLookup: false, }); diff --git a/tests/evals/runner.ts b/tests/evals/runner.ts index dc332348..731a2bc3 100644 --- a/tests/evals/runner.ts +++ b/tests/evals/runner.ts @@ -13,6 +13,7 @@ import { readFileSync, readdirSync, writeFileSync } from "fs"; import { join } from "path"; import { EmailAnalyzer } from "../../src/main/services/email-analyzer"; import type { Email, AnalysisResult } from "../../src/shared/types"; +import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../src/shared/types"; import { scoreDeterministic, type DeterministicResult, @@ -74,7 +75,9 @@ function saveBaseline(scores: Record): void { // --- Runner --- async function runEval(fixtures: EvalFixture[]): Promise { - const analyzer = new EmailAnalyzer(); + // Pass the app's actual default analysis model. Constructing with no args + // falls back to the retired legacy default (claude-sonnet-4-20250514 → 404). + const analyzer = new EmailAnalyzer(resolveModelId(DEFAULT_MODEL_CONFIG.analysis)); const results: EvalReport["results"] = []; const regressions: string[] = []; const baseline = loadBaseline();