diff --git a/packages/semantic-cache/CHANGELOG.md b/packages/semantic-cache/CHANGELOG.md index a8e76a56..301788d7 100644 --- a/packages/semantic-cache/CHANGELOG.md +++ b/packages/semantic-cache/CHANGELOG.md @@ -5,6 +5,26 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.6.0] - Unreleased + +### Added + +- **Judge cancellation via AbortSignal** — the `judgeFn` input now includes a + `signal: AbortSignal`. The cache creates an `AbortController` per judge + invocation and aborts it when `timeoutMs` elapses. Forwarding `signal` to your + LLM client (`fetch`, OpenAI/Anthropic SDKs) cancels the in-flight request on + timeout instead of leaking it to completion and burning API quota. The + previously documented limitation on `JudgeOptions.timeoutMs` ("the underlying + promise is not cancelled") no longer applies when the signal is forwarded. + +### Changed + +- `JudgeOptions.timeoutMs` JSDoc updated to describe the new abort behavior. + +### Breaking changes + +None. The `signal` field is additive — a `judgeFn` that ignores it is unaffected. 
+ ## [0.5.0] - 2026-05-13 ### Added diff --git a/packages/semantic-cache/README.md b/packages/semantic-cache/README.md index 63a3c2a5..da53a1d5 100644 --- a/packages/semantic-cache/README.md +++ b/packages/semantic-cache/README.md @@ -103,16 +103,19 @@ When a hit lands in the uncertainty band (`threshold - uncertaintyBand < score < ```typescript const result = await cache.check(userPrompt, { judge: { - judgeFn: async ({ prompt, response, similarity, threshold, category }) => { + judgeFn: async ({ prompt, response, signal }) => { // Return true to accept (confidence → 'high') // Return false to reject (treated as miss with nearestMiss) - const verdict = await openai.chat.completions.create({ - model: 'gpt-5-mini', - messages: [ - { role: 'system', content: 'Reply YES or NO only.' }, - { role: 'user', content: `Does this cached response correctly answer the prompt?\nPrompt: ${prompt}\nResponse: ${response}` }, - ], - }); + const verdict = await openai.chat.completions.create( + { + model: 'gpt-5-mini', + messages: [ + { role: 'system', content: 'Reply YES or NO only.' }, + { role: 'user', content: `Does this cached response correctly answer the prompt?\nPrompt: ${prompt}\nResponse: ${response}` }, + ], + }, + { signal }, // forwarded — cancelled if the judge times out + ); return verdict.choices[0].message.content?.startsWith('YES') ?? false; }, onError: 'accept', // fail-open on judge errors (default) @@ -121,6 +124,8 @@ const result = await cache.check(userPrompt, { }); ``` +Forward `signal` to your LLM client so a slow judge call is cancelled on timeout rather than running to completion in the background. + **When the judge is invoked:** only for `confidence === 'uncertain'` hits. High-confidence hits, misses, and the zero-candidates case bypass the judge entirely. **Accept path:** `result.hit === true`, `result.confidence === 'high'`. 
diff --git a/packages/semantic-cache/examples/judge/README.md b/packages/semantic-cache/examples/judge/README.md index c8ec4b12..ce500fc7 100644 --- a/packages/semantic-cache/examples/judge/README.md +++ b/packages/semantic-cache/examples/judge/README.md @@ -26,7 +26,7 @@ pnpm start ## Key concepts -- `judgeFn` receives `{ prompt, response, similarity, threshold, category }` +- `judgeFn` receives `{ prompt, response, similarity, threshold, category, signal }` - Return `true` to accept (→ `confidence: 'high'`) - Return `false` to reject (→ `hit: false`, `nearestMiss.deltaToThreshold <= 0`) - `onError: 'accept'` makes the judge fail-open (safe default) diff --git a/packages/semantic-cache/examples/judge/index.ts b/packages/semantic-cache/examples/judge/index.ts index 7381a976..e61d4c99 100644 --- a/packages/semantic-cache/examples/judge/index.ts +++ b/packages/semantic-cache/examples/judge/index.ts @@ -36,13 +36,15 @@ const cache = new SemanticCache({ }); // Mock judge: accepts if the response contains key words from the prompt. -// Replace this with a real LLM call in production. +// Replace this with a real LLM call in production; pass input.signal to your +// HTTP client (fetch, OpenAI/Anthropic SDK) so timeout cancels the request. 
async function mockJudge(input: { prompt: string; response: string; similarity: number; threshold: number; category: string | undefined; + signal: AbortSignal; }): Promise<boolean> { const promptWords = new Set(input.prompt.toLowerCase().split(/\W+/).filter(Boolean)); const responseWords = input.response.toLowerCase().split(/\W+/).filter(Boolean); diff --git a/packages/semantic-cache/src/SemanticCache.ts b/packages/semantic-cache/src/SemanticCache.ts index 279f03c3..b1a1d37b 100644 --- a/packages/semantic-cache/src/SemanticCache.ts +++ b/packages/semantic-cache/src/SemanticCache.ts @@ -405,6 +405,7 @@ export class SemanticCache { | 'timeout_accept' | 'timeout_reject'; let decision: JudgeDecision; + const judgeController = new AbortController(); try { const accepted = await raceWithTimeout( options.judge.judgeFn({ @@ -413,11 +414,14 @@ export class SemanticCache { similarity: winnerScore, threshold, category: category || undefined, + signal: judgeController.signal, }), timeoutMs, + () => judgeController.abort(), ); decision = accepted ? 'accept' : 'reject'; } catch (err) { + // raceWithTimeout already aborted the controller on the timeout path. const isTimeout = err instanceof JudgeTimeoutError; if (onError === 'accept') { decision = isTimeout ? 
'timeout_accept' : 'error_accept'; @@ -1559,10 +1563,17 @@ class JudgeTimeoutError extends Error { } } -function raceWithTimeout<T>(p: Promise<T>, timeoutMs: number): Promise<T> { +function raceWithTimeout<T>( + p: Promise<T>, + timeoutMs: number, + onTimeout?: () => void, +): Promise<T> { let timer!: ReturnType<typeof setTimeout>; const timeout = new Promise<never>((_, reject) => { - timer = setTimeout(() => reject(new JudgeTimeoutError()), timeoutMs); + timer = setTimeout(() => { + onTimeout?.(); + reject(new JudgeTimeoutError()); + }, timeoutMs); }); return Promise.race([p, timeout]).finally(() => clearTimeout(timer)); } diff --git a/packages/semantic-cache/src/__tests__/judge.test.ts b/packages/semantic-cache/src/__tests__/judge.test.ts index bfea621e..4b646b49 100644 --- a/packages/semantic-cache/src/__tests__/judge.test.ts +++ b/packages/semantic-cache/src/__tests__/judge.test.ts @@ -415,10 +415,75 @@ describe('judgeFn receives correct inputs', () => { similarity: expect.closeTo(0.08, 5), threshold: THRESHOLD, category: 'trivia', + signal: expect.any(AbortSignal), }); }); }); +// --- AbortSignal on judgeFn --- + +describe('judgeFn AbortSignal', () => { + it('supplies an AbortSignal to judgeFn', async () => { + const registry = new Registry(); + const client = makeMockClient(BORDERLINE_SCORE); + const cache = await makeCache(client, registry, 'test_judge_signal_supplied'); + let receivedSignal: AbortSignal | undefined; + const judgeFn = vi.fn(async (input: { signal: AbortSignal }) => { + receivedSignal = input.signal; + return true; + }); + + await cache.check('hello', { judge: { judgeFn } }); + + expect(receivedSignal).toBeDefined(); + expect(receivedSignal).toBeInstanceOf(AbortSignal); + expect('aborted' in (receivedSignal as AbortSignal)).toBe(true); + }); + + it('aborts the judgeFn signal when the judge times out', async () => { + const registry = new Registry(); + const client = makeMockClient(BORDERLINE_SCORE); + const cache = await makeCache(client, registry, 'test_judge_signal_timeout'); + let 
observedAbort = false; + const judgeFn = vi.fn((input: { signal: AbortSignal }) => { + return new Promise(() => { + input.signal.addEventListener('abort', () => { + observedAbort = true; + // Stay pending so raceWithTimeout's timeout branch wins (real clients + // would reject here; resolving would promote to confidence: 'high'). + }); + }); + }); + + const result = await cache.check('hello', { + judge: { judgeFn, onError: 'accept', timeoutMs: 20 }, + }); + + expect(observedAbort).toBe(true); + expect(result.hit).toBe(true); + expect(result.confidence).toBe('uncertain'); + }); + + it('does not abort the signal when judgeFn resolves in time', async () => { + const registry = new Registry(); + const client = makeMockClient(BORDERLINE_SCORE); + const cache = await makeCache(client, registry, 'test_judge_signal_no_abort'); + let abortedDuringCall = false; + const judgeFn = vi.fn(async (input: { signal: AbortSignal }) => { + abortedDuringCall = input.signal.aborted; + return true; + }); + + const result = await cache.check('hello', { + judge: { judgeFn, timeoutMs: 5000 }, + }); + + expect(abortedDuringCall).toBe(false); + expect(result.hit).toBe(true); + expect(result.confidence).toBe('high'); + }); +}); + // --- Test 12: checkBatch with judge throws SemanticCacheUsageError --- describe('checkBatch with judge throws SemanticCacheUsageError', () => { diff --git a/packages/semantic-cache/src/types.ts b/packages/semantic-cache/src/types.ts index 1fc18baf..d1494053 100644 --- a/packages/semantic-cache/src/types.ts +++ b/packages/semantic-cache/src/types.ts @@ -170,6 +170,14 @@ export interface JudgeOptions { similarity: number; threshold: number; category: string | undefined; + /** + * Abort signal owned by the cache. It fires when the judge `timeoutMs` + * elapses. Forward it to your LLM client (e.g. `fetch(url, { signal })`, + * the OpenAI/Anthropic SDK `signal` option) so the in-flight request is + * cancelled on timeout instead of running to completion and billing. 
+ * A judgeFn that ignores this field still works — it just won't cancel. + */ + signal: AbortSignal; }) => Promise<boolean>; /** @@ -183,14 +191,14 @@ export interface JudgeOptions { /** * Per-call timeout in milliseconds. Default: 2000. - * The judge function is raced against this timeout; timeout is treated + * The judge function is raced against this timeout; a timeout is treated * the same as a thrown error and routed through onError. * - * Note: the underlying promise is not cancelled on timeout — JavaScript has - * no built-in cancellation primitive. A real LLM HTTP request will continue - * running in the background after the timeout fires, consuming API quota. - * To stop the underlying request, use an AbortController inside judgeFn and - * abort it when the signal you manage fires. + * On timeout the cache aborts the `AbortSignal` passed to judgeFn (see the + * `signal` field on the judgeFn input). Forward that signal to your LLM + * client to cancel the in-flight request and stop consuming API quota. + * If judgeFn ignores the signal, the underlying request will still run to + * completion in the background — JavaScript cannot force-cancel a promise. */ timeoutMs?: number; }