diff --git a/src/shared/types.ts b/src/shared/types.ts index ed28e3ab..61dd52b0 100644 --- a/src/shared/types.ts +++ b/src/shared/types.ts @@ -368,17 +368,22 @@ export type SenderLookupProvider = z.infer; /** * Default Ollama Cloud model when none is configured. * - * kimi-k2.6:cloud is Moonshot's latest. Chosen over deepseek-v4-pro:cloud - * because deepseek's compat-layer thinking depth is capped at default-level - * (no way to invoke "max" through Ollama's Anthropic-compat endpoint — - * filed upstream at ollama/ollama#15952), which made the agent feel - * underpowered. kimi-k2.6 doesn't need a thinking knob to perform well. + * glm-5.2:cloud (z.ai, 744B MoE, 1M context). Chosen after a 16-task + * agent-sidebar benchmark against kimi-k2.7-code:cloud: GLM 5.2 was + * consistently the faster model at equal final-answer quality (both clearly + * beat the prior kimi-k2.6 default). Reasoning is returned in proper + * `thinking` blocks on Ollama's Anthropic-compat endpoint (the agent path), + * so chain-of-thought does not leak into drafts; non-agent features use the + * native /api/chat path with think:true, which likewise keeps CoT out of + * parsed JSON. Note: GLM emits brief inter-turn progress text in agent loops + * (visible as status lines in the sidebar) — suppressible via the agent + * system prompt if undesired. * * Note: cloud models may occasionally return overloaded_error during peak * traffic — llm-service.ts retry logic catches this via Anthropic.APIError * status 529 (rate_limit category) and backs off automatically. 
*/ -export const DEFAULT_OLLAMA_MODEL = "kimi-k2.6:cloud"; +export const DEFAULT_OLLAMA_MODEL = "glm-5.2:cloud"; export const OllamaCloudConfigSchema = z.object({ apiKey: z.string().default(""), diff --git a/tests/evals/features/archive-ready-analyzer.ts b/tests/evals/features/archive-ready-analyzer.ts index 6d8ff048..5e8537ad 100644 --- a/tests/evals/features/archive-ready-analyzer.ts +++ b/tests/evals/features/archive-ready-analyzer.ts @@ -7,6 +7,7 @@ */ import { ArchiveReadyAnalyzer } from "../../../src/main/services/archive-ready-analyzer"; import type { DashboardEmail } from "../../../src/shared/types"; +import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types"; interface ArchiveReadyFixtureInput { thread: DashboardEmail[]; @@ -28,7 +29,7 @@ export async function runArchiveReadyFixture( `[archive-ready] fixture ${fixtureId}: input must be { thread: DashboardEmail[], userEmail? }`, ); } - const analyzer = new ArchiveReadyAnalyzer(); + const analyzer = new ArchiveReadyAnalyzer(resolveModelId(DEFAULT_MODEL_CONFIG.archiveReady)); const result = await analyzer.analyzeThread(input.thread, input.userEmail); return JSON.stringify(result, null, 2); } diff --git a/tests/evals/features/calendaring-agent.ts b/tests/evals/features/calendaring-agent.ts index 8104f9f4..66f7e2ef 100644 --- a/tests/evals/features/calendaring-agent.ts +++ b/tests/evals/features/calendaring-agent.ts @@ -7,6 +7,7 @@ */ import { CalendaringAgent } from "../../../src/main/services/calendaring-agent"; import type { Email } from "../../../src/shared/types"; +import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types"; interface CalendaringFixtureInput { email: Email; @@ -28,7 +29,7 @@ export async function runCalendaringFixture( if (!isInput(input)) { throw new Error(`[calendaring-agent] fixture ${fixtureId}: input must be { email }`); } - const agent = new CalendaringAgent(); + const agent = new 
CalendaringAgent(resolveModelId(DEFAULT_MODEL_CONFIG.calendaring)); const result = await agent.analyze(input.email); return JSON.stringify(result, null, 2); } diff --git a/tests/evals/features/draft-generator.ts b/tests/evals/features/draft-generator.ts index 19646894..2efc1f4d 100644 --- a/tests/evals/features/draft-generator.ts +++ b/tests/evals/features/draft-generator.ts @@ -10,6 +10,7 @@ */ import { DraftGenerator } from "../../../src/main/services/draft-generator"; +import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types"; import type { Email, AnalysisResult } from "../../../src/shared/types"; interface DraftGeneratorFixtureInput { @@ -42,7 +43,7 @@ export async function runDraftGeneratorFixture( // ships, not a test-only configuration. EA + sender lookup are off so // we isolate the draft-generation behavior; those flows have their // own (TODO) eval suites. - const generator = new DraftGenerator(); + const generator = new DraftGenerator(resolveModelId(DEFAULT_MODEL_CONFIG.drafts)); const response = await generator.generateDraft(input.email, input.analysis, undefined, { enableSenderLookup: false, }); diff --git a/tests/evals/runner.ts b/tests/evals/runner.ts index dc332348..731a2bc3 100644 --- a/tests/evals/runner.ts +++ b/tests/evals/runner.ts @@ -13,6 +13,7 @@ import { readFileSync, readdirSync, writeFileSync } from "fs"; import { join } from "path"; import { EmailAnalyzer } from "../../src/main/services/email-analyzer"; import type { Email, AnalysisResult } from "../../src/shared/types"; +import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../src/shared/types"; import { scoreDeterministic, type DeterministicResult, @@ -74,7 +75,9 @@ function saveBaseline(scores: Record): void { // --- Runner --- async function runEval(fixtures: EvalFixture[]): Promise { - const analyzer = new EmailAnalyzer(); + // Pass the app's actual default analysis model. 
Constructing with no args + // falls back to the retired legacy default (claude-sonnet-4-20250514 → 404). + const analyzer = new EmailAnalyzer(resolveModelId(DEFAULT_MODEL_CONFIG.analysis)); const results: EvalReport["results"] = []; const regressions: string[] = []; const baseline = loadBaseline();