ankitvgupta · ankitvgupta · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/src/shared/types.ts b/src/shared/types.ts
@@ -368,17 +368,22 @@ export type SenderLookupProvider = z.infer<typeof SenderLookupProviderSchema>;
 /**
  * Default Ollama Cloud model when none is configured.
  *
- * kimi-k2.6:cloud is Moonshot's latest. Chosen over deepseek-v4-pro:cloud
- * because deepseek's compat-layer thinking depth is capped at default-level
- * (no way to invoke "max" through Ollama's Anthropic-compat endpoint —
- * filed upstream at ollama/ollama#15952), which made the agent feel
- * underpowered. kimi-k2.6 doesn't need a thinking knob to perform well.
+ * glm-5.2:cloud (z.ai, 744B MoE, 1M context). Chosen after a 16-task
+ * agent-sidebar benchmark against kimi-k2.7-code:cloud: GLM 5.2 was
+ * consistently faster, with final-answer quality at parity (both clearly
+ * beat the prior kimi-k2.6 default). Reasoning is returned in proper
+ * `thinking` blocks on Ollama's Anthropic-compat endpoint (the agent path),
+ * so chain-of-thought does not leak into drafts; non-agent features use the
+ * native /api/chat path with think:true, which likewise keeps CoT out of
+ * parsed JSON. Note: GLM emits brief inter-turn progress text in agent loops
+ * (visible as status lines in the sidebar) — suppressible via the agent
+ * system prompt if undesired.
  *
  * Note: cloud models may occasionally return overloaded_error during peak
  * traffic — llm-service.ts retry logic catches this via Anthropic.APIError
  * status 529 (rate_limit category) and backs off automatically.
  */
-export const DEFAULT_OLLAMA_MODEL = "kimi-k2.6:cloud";
+export const DEFAULT_OLLAMA_MODEL = "glm-5.2:cloud";
 
 export const OllamaCloudConfigSchema = z.object({
   apiKey: z.string().default(""),

diff --git a/tests/evals/features/archive-ready-analyzer.ts b/tests/evals/features/archive-ready-analyzer.ts
@@ -7,6 +7,7 @@
  */
 import { ArchiveReadyAnalyzer } from "../../../src/main/services/archive-ready-analyzer";
 import type { DashboardEmail } from "../../../src/shared/types";
+import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types";
 
 interface ArchiveReadyFixtureInput {
   thread: DashboardEmail[];
@@ -28,7 +29,7 @@ export async function runArchiveReadyFixture(
       `[archive-ready] fixture ${fixtureId}: input must be { thread: DashboardEmail[], userEmail? }`,
     );
   }
-  const analyzer = new ArchiveReadyAnalyzer();
+  const analyzer = new ArchiveReadyAnalyzer(resolveModelId(DEFAULT_MODEL_CONFIG.archiveReady));
   const result = await analyzer.analyzeThread(input.thread, input.userEmail);
   return JSON.stringify(result, null, 2);
 }
diff --git a/tests/evals/features/calendaring-agent.ts b/tests/evals/features/calendaring-agent.ts
@@ -7,6 +7,7 @@
  */
 import { CalendaringAgent } from "../../../src/main/services/calendaring-agent";
 import type { Email } from "../../../src/shared/types";
+import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types";
 
 interface CalendaringFixtureInput {
   email: Email;
@@ -28,7 +29,7 @@ export async function runCalendaringFixture(
   if (!isInput(input)) {
     throw new Error(`[calendaring-agent] fixture ${fixtureId}: input must be { email }`);
   }
-  const agent = new CalendaringAgent();
+  const agent = new CalendaringAgent(resolveModelId(DEFAULT_MODEL_CONFIG.calendaring));
   const result = await agent.analyze(input.email);
   return JSON.stringify(result, null, 2);
 }
diff --git a/tests/evals/features/draft-generator.ts b/tests/evals/features/draft-generator.ts
@@ -10,6 +10,7 @@
  */
 
 import { DraftGenerator } from "../../../src/main/services/draft-generator";
+import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types";
 import type { Email, AnalysisResult } from "../../../src/shared/types";
 
 interface DraftGeneratorFixtureInput {
@@ -42,7 +43,7 @@ export async function runDraftGeneratorFixture(
   // ships, not a test-only configuration. EA + sender lookup are off so
   // we isolate the draft-generation behavior; those flows have their
   // own (TODO) eval suites.
-  const generator = new DraftGenerator();
+  const generator = new DraftGenerator(resolveModelId(DEFAULT_MODEL_CONFIG.drafts));
   const response = await generator.generateDraft(input.email, input.analysis, undefined, {
     enableSenderLookup: false,
   });

diff --git a/tests/evals/runner.ts b/tests/evals/runner.ts
@@ -13,6 +13,7 @@ import { readFileSync, readdirSync, writeFileSync } from "fs";
 import { join } from "path";
 import { EmailAnalyzer } from "../../src/main/services/email-analyzer";
 import type { Email, AnalysisResult } from "../../src/shared/types";
+import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../src/shared/types";
 import {
   scoreDeterministic,
   type DeterministicResult,
@@ -74,7 +75,9 @@ function saveBaseline(scores: Record<string, number>): void {
 // --- Runner ---
 
 async function runEval(fixtures: EvalFixture[]): Promise<EvalReport> {
-  const analyzer = new EmailAnalyzer();
+  // Pass the app's actual default analysis model. Constructing with no args
+  // falls back to the retired legacy default (claude-sonnet-4-20250514 → 404).
+  const analyzer = new EmailAnalyzer(resolveModelId(DEFAULT_MODEL_CONFIG.analysis));
   const results: EvalReport["results"] = [];
   const regressions: string[] = [];
   const baseline = loadBaseline();