From b68a586a92eabaa129e1919aa1c440a119c52a3d Mon Sep 17 00:00:00 2001
From: Ankit Gupta <ankit@ycombinator.com>
Date: Tue, 16 Jun 2026 16:43:14 -0400
Subject: [PATCH 1/2] feat: make GLM 5.2 the default Ollama Cloud model

Switch DEFAULT_OLLAMA_MODEL from kimi-k2.6:cloud to glm-5.2:cloud.

Validated with a 16-task agent-sidebar benchmark (real test inbox, interleaved,
warm, auto-draft disabled) against kimi-k2.7-code:cloud. GLM 5.2 was the faster
model in both runs (faster on 10/16 tasks, then on 13/16), with final-answer
quality at parity. Both models clearly beat the prior kimi-k2.6 default.
Reasoning is returned in proper `thinking` blocks on Ollama's Anthropic-compat
endpoint, so chain-of-thought does not leak into drafts.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/shared/types.ts | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/src/shared/types.ts b/src/shared/types.ts
index ed28e3ab..61dd52b0 100644
--- a/src/shared/types.ts
+++ b/src/shared/types.ts
@@ -368,17 +368,22 @@ export type SenderLookupProvider = z.infer<typeof SenderLookupProviderSchema>;
 /**
  * Default Ollama Cloud model when none is configured.
  *
- * kimi-k2.6:cloud is Moonshot's latest. Chosen over deepseek-v4-pro:cloud
- * because deepseek's compat-layer thinking depth is capped at default-level
- * (no way to invoke "max" through Ollama's Anthropic-compat endpoint —
- * filed upstream at ollama/ollama#15952), which made the agent feel
- * underpowered. kimi-k2.6 doesn't need a thinking knob to perform well.
+ * glm-5.2:cloud (z.ai, 744B MoE, 1M context). Chosen after a 16-task
+ * agent-sidebar benchmark against kimi-k2.7-code:cloud: GLM 5.2 was
+ * consistently the faster model, with final-answer quality at parity (both
+ * clearly beat the prior kimi-k2.6 default). Reasoning is returned in proper
+ * `thinking` blocks on Ollama's Anthropic-compat endpoint (the agent path),
+ * so chain-of-thought does not leak into drafts; non-agent features use the
+ * native /api/chat path with think:true, which likewise keeps CoT out of
+ * parsed JSON. Note: GLM emits brief inter-turn progress text in agent loops
+ * (visible as status lines in the sidebar) — suppressible via the agent
+ * system prompt if undesired.
  *
  * Note: cloud models may occasionally return overloaded_error during peak
  * traffic — llm-service.ts retry logic catches this via Anthropic.APIError
  * status 529 (rate_limit category) and backs off automatically.
  */
-export const DEFAULT_OLLAMA_MODEL = "kimi-k2.6:cloud";
+export const DEFAULT_OLLAMA_MODEL = "glm-5.2:cloud";
 
 export const OllamaCloudConfigSchema = z.object({
   apiKey: z.string().default(""),

From 226806d836b6e10edb227ec194934b84ff9bf070 Mon Sep 17 00:00:00 2001
From: Ankit Gupta <ankit@ycombinator.com>
Date: Tue, 16 Jun 2026 17:17:36 -0400
Subject: [PATCH 2/2] fix(evals): stop eval harness from using retired default
 model (claude-sonnet-4-20250514)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The eval runners constructed services with no model argument, falling back to
the retired legacy constructor default `claude-sonnet-4-20250514`, which now
404s — breaking `npm run eval` (and the pre-pr eval gate) for everyone. The app
itself is unaffected because it passes the resolved live model.

Pass each eval service the app's actual default model via
resolveModelId(DEFAULT_MODEL_CONFIG[feature]) so the eval tracks real default
resolution and can't rot when a model id is retired:
- runner.ts (analyzer)
- features/draft-generator.ts
- features/calendaring-agent.ts
- features/archive-ready-analyzer.ts

Verified: analyzer eval 10/10 no regressions; feature evals pass with +0.0
deltas vs baseline.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/evals/features/archive-ready-analyzer.ts | 3 ++-
 tests/evals/features/calendaring-agent.ts      | 3 ++-
 tests/evals/features/draft-generator.ts        | 3 ++-
 tests/evals/runner.ts                          | 5 ++++-
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/tests/evals/features/archive-ready-analyzer.ts b/tests/evals/features/archive-ready-analyzer.ts
index 6d8ff048..5e8537ad 100644
--- a/tests/evals/features/archive-ready-analyzer.ts
+++ b/tests/evals/features/archive-ready-analyzer.ts
@@ -7,6 +7,7 @@
  */
 import { ArchiveReadyAnalyzer } from "../../../src/main/services/archive-ready-analyzer";
 import type { DashboardEmail } from "../../../src/shared/types";
+import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types";
 
 interface ArchiveReadyFixtureInput {
   thread: DashboardEmail[];
@@ -28,7 +29,7 @@ export async function runArchiveReadyFixture(
       `[archive-ready] fixture ${fixtureId}: input must be { thread: DashboardEmail[], userEmail? }`,
     );
   }
-  const analyzer = new ArchiveReadyAnalyzer();
+  const analyzer = new ArchiveReadyAnalyzer(resolveModelId(DEFAULT_MODEL_CONFIG.archiveReady));
   const result = await analyzer.analyzeThread(input.thread, input.userEmail);
   return JSON.stringify(result, null, 2);
 }
diff --git a/tests/evals/features/calendaring-agent.ts b/tests/evals/features/calendaring-agent.ts
index 8104f9f4..66f7e2ef 100644
--- a/tests/evals/features/calendaring-agent.ts
+++ b/tests/evals/features/calendaring-agent.ts
@@ -7,6 +7,7 @@
  */
 import { CalendaringAgent } from "../../../src/main/services/calendaring-agent";
 import type { Email } from "../../../src/shared/types";
+import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types";
 
 interface CalendaringFixtureInput {
   email: Email;
@@ -28,7 +29,7 @@ export async function runCalendaringFixture(
   if (!isInput(input)) {
     throw new Error(`[calendaring-agent] fixture ${fixtureId}: input must be { email }`);
   }
-  const agent = new CalendaringAgent();
+  const agent = new CalendaringAgent(resolveModelId(DEFAULT_MODEL_CONFIG.calendaring));
   const result = await agent.analyze(input.email);
   return JSON.stringify(result, null, 2);
 }
diff --git a/tests/evals/features/draft-generator.ts b/tests/evals/features/draft-generator.ts
index 19646894..2efc1f4d 100644
--- a/tests/evals/features/draft-generator.ts
+++ b/tests/evals/features/draft-generator.ts
@@ -10,6 +10,7 @@
  */
 
 import { DraftGenerator } from "../../../src/main/services/draft-generator";
+import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types";
 import type { Email, AnalysisResult } from "../../../src/shared/types";
 
 interface DraftGeneratorFixtureInput {
@@ -42,7 +43,7 @@ export async function runDraftGeneratorFixture(
   // ships, not a test-only configuration. EA + sender lookup are off so
   // we isolate the draft-generation behavior; those flows have their
   // own (TODO) eval suites.
-  const generator = new DraftGenerator();
+  const generator = new DraftGenerator(resolveModelId(DEFAULT_MODEL_CONFIG.drafts));
   const response = await generator.generateDraft(input.email, input.analysis, undefined, {
     enableSenderLookup: false,
   });
diff --git a/tests/evals/runner.ts b/tests/evals/runner.ts
index dc332348..731a2bc3 100644
--- a/tests/evals/runner.ts
+++ b/tests/evals/runner.ts
@@ -13,6 +13,7 @@ import { readFileSync, readdirSync, writeFileSync } from "fs";
 import { join } from "path";
 import { EmailAnalyzer } from "../../src/main/services/email-analyzer";
 import type { Email, AnalysisResult } from "../../src/shared/types";
+import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../src/shared/types";
 import {
   scoreDeterministic,
   type DeterministicResult,
@@ -74,7 +75,9 @@ function saveBaseline(scores: Record<string, number>): void {
 // --- Runner ---
 
 async function runEval(fixtures: EvalFixture[]): Promise<EvalReport> {
-  const analyzer = new EmailAnalyzer();
+  // Pass the app's actual default analysis model. Constructing with no args
+  // falls back to the retired legacy default (claude-sonnet-4-20250514 → 404).
+  const analyzer = new EmailAnalyzer(resolveModelId(DEFAULT_MODEL_CONFIG.analysis));
   const results: EvalReport["results"] = [];
   const regressions: string[] = [];
   const baseline = loadBaseline();