BetterDB-inc · amitkojha05 · May 21, 2026
diff --git a/packages/semantic-cache/CHANGELOG.md b/packages/semantic-cache/CHANGELOG.md
@@ -5,6 +5,26 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.6.0] - Unreleased
+
+### Added
+
+- **Judge cancellation via AbortSignal** — the `judgeFn` input now includes a
+  `signal: AbortSignal`. The cache creates an `AbortController` per judge
+  invocation and aborts it when `timeoutMs` elapses. Forwarding `signal` to your
+  LLM client (`fetch`, OpenAI/Anthropic SDKs) cancels the in-flight request on
+  timeout instead of letting it run to completion and burn API quota. The
+  previously documented limitation on `JudgeOptions.timeoutMs` ("the underlying
+  promise is not cancelled") no longer applies when the signal is forwarded.
+
+### Changed
+
+- `JudgeOptions.timeoutMs` JSDoc updated to describe the new abort behavior.
+
+### Breaking changes
+
+None. The `signal` field is additive — a `judgeFn` that ignores it is unaffected.
+
 ## [0.5.0] - 2026-05-13
 
 ### Added

diff --git a/packages/semantic-cache/README.md b/packages/semantic-cache/README.md
@@ -103,16 +103,19 @@ When a hit lands in the uncertainty band (`threshold - uncertaintyBand < score <
 ```typescript
 const result = await cache.check(userPrompt, {
   judge: {
-    judgeFn: async ({ prompt, response, similarity, threshold, category }) => {
+    judgeFn: async ({ prompt, response, signal }) => {
       // Return true to accept (confidence → 'high')
       // Return false to reject (treated as miss with nearestMiss)
-      const verdict = await openai.chat.completions.create({
-        model: 'gpt-5-mini',
-        messages: [
-          { role: 'system', content: 'Reply YES or NO only.' },
-          { role: 'user', content: `Does this cached response correctly answer the prompt?\nPrompt: ${prompt}\nResponse: ${response}` },
-        ],
-      });
+      const verdict = await openai.chat.completions.create(
+        {
+          model: 'gpt-5-mini',
+          messages: [
+            { role: 'system', content: 'Reply YES or NO only.' },
+            { role: 'user', content: `Does this cached response correctly answer the prompt?\nPrompt: ${prompt}\nResponse: ${response}` },
+          ],
+        },
+        { signal }, // forwarded — cancelled if the judge times out
+      );
       return verdict.choices[0].message.content?.startsWith('YES') ?? false;
     },
     onError: 'accept',  // fail-open on judge errors (default)
@@ -121,6 +124,8 @@ const result = await cache.check(userPrompt, {
 });
 ```
 
+Forward `signal` to your LLM client so a slow judge call is cancelled on timeout rather than running to completion in the background.
+
 **When the judge is invoked:** only for `confidence === 'uncertain'` hits. High-confidence hits, misses, and the zero-candidates case bypass the judge entirely.
 
 **Accept path:** `result.hit === true`, `result.confidence === 'high'`.

diff --git a/packages/semantic-cache/examples/judge/README.md b/packages/semantic-cache/examples/judge/README.md
@@ -26,7 +26,7 @@ pnpm start
 
 ## Key concepts
 
-- `judgeFn` receives `{ prompt, response, similarity, threshold, category }`
+- `judgeFn` receives `{ prompt, response, similarity, threshold, category, signal }`
 - Return `true` to accept (→ `confidence: 'high'`)
 - Return `false` to reject (→ `hit: false`, `nearestMiss.deltaToThreshold <= 0`)
 - `onError: 'accept'` makes the judge fail-open (safe default)

diff --git a/packages/semantic-cache/examples/judge/index.ts b/packages/semantic-cache/examples/judge/index.ts
@@ -36,13 +36,15 @@ const cache = new SemanticCache({
 });
 
 // Mock judge: accepts if the response contains key words from the prompt.
-// Replace this with a real LLM call in production.
+// Replace this with a real LLM call in production; pass input.signal to your
+// HTTP client (fetch, OpenAI/Anthropic SDK) so timeout cancels the request.
 async function mockJudge(input: {
   prompt: string;
   response: string;
   similarity: number;
   threshold: number;
   category: string | undefined;
+  signal: AbortSignal;
 }): Promise<boolean> {
   const promptWords = new Set(input.prompt.toLowerCase().split(/\W+/).filter(Boolean));
   const responseWords = input.response.toLowerCase().split(/\W+/).filter(Boolean);

diff --git a/packages/semantic-cache/src/SemanticCache.ts b/packages/semantic-cache/src/SemanticCache.ts
@@ -405,6 +405,7 @@ export class SemanticCache {
           | 'timeout_accept' | 'timeout_reject';
         let decision: JudgeDecision;
 
+        const judgeController = new AbortController();
         try {
           const accepted = await raceWithTimeout(
             options.judge.judgeFn({
@@ -413,11 +414,14 @@ export class SemanticCache {
               similarity: winnerScore,
               threshold,
               category: category || undefined,
+              signal: judgeController.signal,
             }),
             timeoutMs,
+            () => judgeController.abort(),
           );
           decision = accepted ? 'accept' : 'reject';
         } catch (err) {
+          // raceWithTimeout already aborted the controller on the timeout path.
           const isTimeout = err instanceof JudgeTimeoutError;
           if (onError === 'accept') {
             decision = isTimeout ? 'timeout_accept' : 'error_accept';
@@ -1559,10 +1563,32 @@ class JudgeTimeoutError extends Error {
   }
 }
 
-function raceWithTimeout<T>(p: Promise<T>, timeoutMs: number): Promise<T> {
+/**
+ * Races `p` against a timer and rejects with `JudgeTimeoutError` if the timer
+ * elapses first. The timer is cleared once the race settles either way.
+ *
+ * @param p - Promise to race against the timeout.
+ * @param timeoutMs - Delay in milliseconds before the timeout rejection.
+ * @param onTimeout - Optional hook run when the timer elapses (the judge path
+ *   uses it to abort its AbortController).
+ * @returns A promise that settles like `p`, or rejects with
+ *   `JudgeTimeoutError` when the timer elapses first.
+ */
+function raceWithTimeout<T>(
+  p: Promise<T>,
+  timeoutMs: number,
+  onTimeout?: () => void,
+): Promise<T> {
   let timer!: ReturnType<typeof setTimeout>;
   const timeout = new Promise<never>((_, reject) => {
-    timer = setTimeout(() => reject(new JudgeTimeoutError()), timeoutMs);
+    timer = setTimeout(() => {
+      // Reject before running the hook: aborting can settle `p` synchronously
+      // (e.g. a fetch promise rejecting with AbortError), and whichever promise
+      // settles first wins the race. Rejecting first keeps the outcome a
+      // JudgeTimeoutError, and still settles the race if the hook throws.
+      reject(new JudgeTimeoutError());
+      onTimeout?.();
+    }, timeoutMs);
   });
   return Promise.race([p, timeout]).finally(() => clearTimeout(timer));
 }
diff --git a/packages/semantic-cache/src/__tests__/judge.test.ts b/packages/semantic-cache/src/__tests__/judge.test.ts
@@ -415,10 +415,75 @@ describe('judgeFn receives correct inputs', () => {
       similarity: expect.closeTo(0.08, 5),
       threshold: THRESHOLD,
       category: 'trivia',
+      signal: expect.any(AbortSignal),
     });
   });
 });
 
+// --- AbortSignal on judgeFn ---
+
+describe('judgeFn AbortSignal', () => {
+  it('supplies an AbortSignal to judgeFn', async () => {
+    const registry = new Registry();
+    const client = makeMockClient(BORDERLINE_SCORE);
+    const cache = await makeCache(client, registry, 'test_judge_signal_supplied');
+    let receivedSignal: AbortSignal | undefined;
+    const judgeFn = vi.fn(async (input: { signal: AbortSignal }) => {
+      receivedSignal = input.signal;
+      return true;
+    });
+
+    await cache.check('hello', { judge: { judgeFn } });
+
+    expect(receivedSignal).toBeDefined();
+    expect(receivedSignal).toBeInstanceOf(AbortSignal);
+    expect('aborted' in (receivedSignal as AbortSignal)).toBe(true);
+  });
+
+  it('aborts the judgeFn signal when the judge times out', async () => {
+    const registry = new Registry();
+    const client = makeMockClient(BORDERLINE_SCORE);
+    const cache = await makeCache(client, registry, 'test_judge_signal_timeout');
+    let observedAbort = false;
+    const judgeFn = vi.fn((input: { signal: AbortSignal }) => {
+      return new Promise<boolean>(() => {
+        input.signal.addEventListener('abort', () => {
+          observedAbort = true;
+          // Stay pending so raceWithTimeout's timeout branch wins (real clients
+          // would reject here; resolving would promote to confidence: 'high').
+        });
+      });
+    });
+
+    const result = await cache.check('hello', {
+      judge: { judgeFn, onError: 'accept', timeoutMs: 20 },
+    });
+
+    expect(observedAbort).toBe(true);
+    expect(result.hit).toBe(true);
+    expect(result.confidence).toBe('uncertain');
+  });
+
+  it('does not abort the signal when judgeFn resolves in time', async () => {
+    const registry = new Registry();
+    const client = makeMockClient(BORDERLINE_SCORE);
+    const cache = await makeCache(client, registry, 'test_judge_signal_no_abort');
+    let receivedSignal: AbortSignal | undefined;
+    const judgeFn = vi.fn(async (input: { signal: AbortSignal }) => {
+      receivedSignal = input.signal;
+      return true;
+    });
+
+    const result = await cache.check('hello', {
+      judge: { judgeFn, timeoutMs: 5000 },
+    });
+    // Read the flag after check() settles; at judgeFn entry it is trivially false.
+    expect(receivedSignal?.aborted).toBe(false);
+    expect(result.hit).toBe(true);
+    expect(result.confidence).toBe('high');
+  });
+});
+
 // --- Test 12: checkBatch with judge throws SemanticCacheUsageError ---
 
 describe('checkBatch with judge throws SemanticCacheUsageError', () => {

diff --git a/packages/semantic-cache/src/types.ts b/packages/semantic-cache/src/types.ts
@@ -170,6 +170,14 @@ export interface JudgeOptions {
     similarity: number;
     threshold: number;
     category: string | undefined;
+    /**
+     * Abort signal owned by the cache. It is aborted when the judge `timeoutMs`
+     * elapses. Forward it to your LLM client (e.g. `fetch(url, { signal })`,
+     * the OpenAI/Anthropic SDK `signal` option) so the in-flight request is
+     * cancelled on timeout instead of running to completion and being billed.
+     * A judgeFn that ignores this field still works — it just won't cancel.
+     */
+    signal: AbortSignal;
   }) => Promise<boolean>;
 
   /**
@@ -183,14 +191,14 @@ export interface JudgeOptions {
 
   /**
    * Per-call timeout in milliseconds. Default: 2000.
-   * The judge function is raced against this timeout; timeout is treated
+   * The judge function is raced against this timeout; a timeout is treated
    * the same as a thrown error and routed through onError.
    *
-   * Note: the underlying promise is not cancelled on timeout — JavaScript has
-   * no built-in cancellation primitive. A real LLM HTTP request will continue
-   * running in the background after the timeout fires, consuming API quota.
-   * To stop the underlying request, use an AbortController inside judgeFn and
-   * abort it when the signal you manage fires.
+   * On timeout the cache aborts the `AbortSignal` passed to judgeFn (see the
+   * `signal` field on the judgeFn input). Forward that signal to your LLM
+   * client to cancel the in-flight request and stop consuming API quota.
+   * If judgeFn ignores the signal, the underlying request will still run to
+   * completion in the background — JavaScript cannot force-cancel a promise.
    */
   timeoutMs?: number;
 }