Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 11 additions & 6 deletions src/shared/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -368,17 +368,22 @@ export type SenderLookupProvider = z.infer<typeof SenderLookupProviderSchema>;
/**
* Default Ollama Cloud model when none is configured.
*
* kimi-k2.6:cloud is Moonshot's latest. Chosen over deepseek-v4-pro:cloud
* because deepseek's compat-layer thinking depth is capped at default-level
* (no way to invoke "max" through Ollama's Anthropic-compat endpoint —
* filed upstream at ollama/ollama#15952), which made the agent feel
* underpowered. kimi-k2.6 doesn't need a thinking knob to perform well.
* glm-5.2:cloud (z.ai, 744B MoE, 1M context). Chosen after a 16-task
* agent-sidebar benchmark against kimi-k2.7-code:cloud: GLM 5.2 was
* consistently faster at comparable final-answer quality (both clearly
* beat the prior kimi-k2.6 default). Reasoning is returned in proper
* `thinking` blocks on Ollama's Anthropic-compat endpoint (the agent path),
* so chain-of-thought does not leak into drafts; non-agent features use the
* native /api/chat path with think:true, which likewise keeps CoT out of
* parsed JSON. Note: GLM emits brief inter-turn progress text in agent loops
* (visible as status lines in the sidebar) — suppressible via the agent
* system prompt if undesired.
*
* Note: cloud models may occasionally return overloaded_error during peak
* traffic — llm-service.ts retry logic catches this via Anthropic.APIError
* status 529 (rate_limit category) and backs off automatically.
*/
export const DEFAULT_OLLAMA_MODEL = "kimi-k2.6:cloud";
export const DEFAULT_OLLAMA_MODEL = "glm-5.2:cloud";

export const OllamaCloudConfigSchema = z.object({
apiKey: z.string().default(""),
Expand Down
3 changes: 2 additions & 1 deletion tests/evals/features/archive-ready-analyzer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
*/
import { ArchiveReadyAnalyzer } from "../../../src/main/services/archive-ready-analyzer";
import type { DashboardEmail } from "../../../src/shared/types";
import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types";

interface ArchiveReadyFixtureInput {
thread: DashboardEmail[];
Expand All @@ -28,7 +29,7 @@ export async function runArchiveReadyFixture(
`[archive-ready] fixture ${fixtureId}: input must be { thread: DashboardEmail[], userEmail? }`,
);
}
const analyzer = new ArchiveReadyAnalyzer();
const analyzer = new ArchiveReadyAnalyzer(resolveModelId(DEFAULT_MODEL_CONFIG.archiveReady));
const result = await analyzer.analyzeThread(input.thread, input.userEmail);
return JSON.stringify(result, null, 2);
}
3 changes: 2 additions & 1 deletion tests/evals/features/calendaring-agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
*/
import { CalendaringAgent } from "../../../src/main/services/calendaring-agent";
import type { Email } from "../../../src/shared/types";
import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types";

interface CalendaringFixtureInput {
email: Email;
Expand All @@ -28,7 +29,7 @@ export async function runCalendaringFixture(
if (!isInput(input)) {
throw new Error(`[calendaring-agent] fixture ${fixtureId}: input must be { email }`);
}
const agent = new CalendaringAgent();
const agent = new CalendaringAgent(resolveModelId(DEFAULT_MODEL_CONFIG.calendaring));
const result = await agent.analyze(input.email);
return JSON.stringify(result, null, 2);
}
3 changes: 2 additions & 1 deletion tests/evals/features/draft-generator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
*/

import { DraftGenerator } from "../../../src/main/services/draft-generator";
import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../../src/shared/types";
import type { Email, AnalysisResult } from "../../../src/shared/types";

interface DraftGeneratorFixtureInput {
Expand Down Expand Up @@ -42,7 +43,7 @@ export async function runDraftGeneratorFixture(
// ships, not a test-only configuration. EA + sender lookup are off so
// we isolate the draft-generation behavior; those flows have their
// own (TODO) eval suites.
const generator = new DraftGenerator();
const generator = new DraftGenerator(resolveModelId(DEFAULT_MODEL_CONFIG.drafts));
const response = await generator.generateDraft(input.email, input.analysis, undefined, {
enableSenderLookup: false,
});
Expand Down
5 changes: 4 additions & 1 deletion tests/evals/runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import { readFileSync, readdirSync, writeFileSync } from "fs";
import { join } from "path";
import { EmailAnalyzer } from "../../src/main/services/email-analyzer";
import type { Email, AnalysisResult } from "../../src/shared/types";
import { resolveModelId, DEFAULT_MODEL_CONFIG } from "../../src/shared/types";
import {
scoreDeterministic,
type DeterministicResult,
Expand Down Expand Up @@ -74,7 +75,9 @@ function saveBaseline(scores: Record<string, number>): void {
// --- Runner ---

async function runEval(fixtures: EvalFixture[]): Promise<EvalReport> {
const analyzer = new EmailAnalyzer();
// Pass the app's actual default analysis model. Constructing with no args
// falls back to the retired legacy default (claude-sonnet-4-20250514), which
// now fails with a 404.
const analyzer = new EmailAnalyzer(resolveModelId(DEFAULT_MODEL_CONFIG.analysis));
const results: EvalReport["results"] = [];
const regressions: string[] = [];
const baseline = loadBaseline();
Expand Down
Loading