diff --git a/.gitignore b/.gitignore
index bdce9acf..29288a44 100644
--- a/.gitignore
+++ b/.gitignore
@@ -82,3 +82,8 @@ packages/mcp/.mcpregistry_github_token
 .gitignore
 packages/mcp/.mcpregistry_registry_token
 data/license.key
+
+# LongMemEval retrieval harness: embedding cache + downloaded datasets
+packages/retrieval/eval/longmemeval/.cache/
+packages/retrieval/eval/longmemeval/longmemeval_*.json
+packages/retrieval/eval/longmemeval/*_oracle.json
diff --git a/packages/retrieval/eval/longmemeval/adapter.ts b/packages/retrieval/eval/longmemeval/adapter.ts
new file mode 100644
index 00000000..32389b36
--- /dev/null
+++ b/packages/retrieval/eval/longmemeval/adapter.ts
@@ -0,0 +1,97 @@
+import type { UpsertEntry, QueryHit } from '../../src/index';
+import type { ChunkMode, LmeRecord, LmeSession } from './types';
+
+// text-embedding-3-small accepts at most 8191 tokens per input. Cap each chunk
+// well under that (~4 chars/token heuristic, with margin) so a long session is
+// split into multiple chunks instead of failing the embedding call. Every part
+// keeps the session's session_id, so recall (which matches on session_id) is
+// unaffected.
+const MAX_EMBED_CHARS = 24000;
+
+/** Hard-slice a string into consecutive pieces each at most `budget` chars. */
+function sliceToBudget(text: string, budget: number): string[] {
+  const parts: string[] = [];
+  for (let i = 0; i < text.length; i += budget) {
+    parts.push(text.slice(i, i + budget));
+  }
+  return parts;
+}
+
+/** Pack a session's turns into newline-joined chunks each within `budget`. */
+function packTurns(session: LmeSession, budget: number): string[] {
+  const lines: string[] = [];
+  for (const turn of session) {
+    const line = `${turn.role}: ${turn.content}`;
+    if (line.length <= budget) {
+      lines.push(line);
+    } else {
+      // A single turn larger than the budget is hard-sliced so it still embeds.
+      lines.push(...sliceToBudget(line, budget));
+    }
+  }
+  const chunks: string[] = [];
+  let current = '';
+  for (const line of lines) {
+    if (current.length > 0 && current.length + 1 + line.length > budget) {
+      chunks.push(current);
+      current = line;
+    } else {
+      current = current.length === 0 ? line : `${current}\n${line}`;
+    }
+  }
+  if (current.length > 0) chunks.push(current);
+  return chunks;
+}
+
+/**
+ * Turn a LongMemEval haystack into UpsertEntry chunks.
+ * - 'session' (default): one chunk per session (turns joined); sessions longer
+ *   than the embedder's input budget are split into multiple chunks that all
+ *   carry the same session_id.
+ * - 'turn': one chunk per turn.
+ * The id encodes the session index (+ turn/part index when split); fields carry
+ * the session_id tag (+ date tag when present) so recall can match evidence.
+ */
+export function chunkRecord(record: LmeRecord, mode: ChunkMode): UpsertEntry[] {
+  const entries: UpsertEntry[] = [];
+  record.haystack_sessions.forEach((session, sIdx) => {
+    const sessionId = record.haystack_session_ids[sIdx] ?? `session_${sIdx}`;
+    const date = record.haystack_dates?.[sIdx];
+    const baseFields: Record<string, string> = { session_id: sessionId };
+    if (date !== undefined && date !== '') {
+      baseFields.date = date;
+    }
+
+    if (mode === 'turn') {
+      session.forEach((turn, tIdx) => {
+        const text = `${turn.role}: ${turn.content}`;
+        // A single turn can exceed the embedder budget too; hard-slice it like
+        // session mode so it still embeds instead of failing the chunk.
+        const parts = text.length <= MAX_EMBED_CHARS ? [text] : sliceToBudget(text, MAX_EMBED_CHARS);
+        parts.forEach((part, pIdx) => {
+          entries.push({
+            id: parts.length === 1 ? `s${sIdx}_t${tIdx}` : `s${sIdx}_t${tIdx}_p${pIdx}`,
+            text: part,
+            fields: { ...baseFields },
+          });
+        });
+      });
+    } else {
+      const parts = packTurns(session, MAX_EMBED_CHARS);
+      parts.forEach((text, pIdx) => {
+        entries.push({
+          id: parts.length === 1 ? `s${sIdx}` : `s${sIdx}_p${pIdx}`,
+          text,
+          fields: { ...baseFields },
+        });
+      });
+    }
+  });
+  return entries;
+}
+
+/** A record is a recall HIT if any retrieved chunk's session_id is evidence. */
+export function recordIsHit(hits: QueryHit[], answerSessionIds: string[]): boolean {
+  const evidence = new Set(answerSessionIds);
+  return hits.some((hit) => evidence.has(hit.fields.session_id));
+}
diff --git a/packages/retrieval/eval/longmemeval/dataset.ts b/packages/retrieval/eval/longmemeval/dataset.ts
new file mode 100644
index 00000000..1a8670ef
--- /dev/null
+++ b/packages/retrieval/eval/longmemeval/dataset.ts
@@ -0,0 +1,28 @@
+import { readFile } from 'node:fs/promises';
+import { fileURLToPath } from 'node:url';
+import { dirname, join } from 'node:path';
+import type { LmeRecord } from './types';
+
+function fixturePath(): string {
+  return join(dirname(fileURLToPath(import.meta.url)), 'fixture.json');
+}
+
+/** Load the bundled LongMemEval-shaped fixture (offline, deterministic). */
+export async function loadFixture(): Promise<LmeRecord[]> {
+  const raw = await readFile(fixturePath(), 'utf8');
+  return JSON.parse(raw) as LmeRecord[];
+}
+
+/**
+ * Load the dataset: the real LongMemEval json at `dataPath` when given, else
+ * the bundled fixture. Returns records plus a human-readable source label.
+ */
+export async function loadDataset(
+  dataPath: string | undefined,
+): Promise<{ records: LmeRecord[]; source: string }> {
+  if (dataPath !== undefined && dataPath !== '') {
+    const raw = await readFile(dataPath, 'utf8');
+    return { records: JSON.parse(raw) as LmeRecord[], source: dataPath };
+  }
+  return { records: await loadFixture(), source: 'bundled fixture' };
+}
diff --git a/packages/retrieval/eval/longmemeval/embed.ts b/packages/retrieval/eval/longmemeval/embed.ts
new file mode 100644
index 00000000..a7400e3c
--- /dev/null
+++ b/packages/retrieval/eval/longmemeval/embed.ts
@@ -0,0 +1,120 @@
+import { createHash } from 'node:crypto';
+import { mkdir, readFile, writeFile } from 'node:fs/promises';
+import { dirname } from 'node:path';
+import type { Embedder } from './types';
+
+const MOCK_DIM = 256;
+const OPENAI_MODEL = 'text-embedding-3-small';
+const OPENAI_DIM = 1536;
+
+function tokenize(text: string): string[] {
+  return text
+    .toLowerCase()
+    .split(/[^a-z0-9]+/)
+    .filter((t) => t.length > 0);
+}
+
+function l2normalize(vec: number[]): number[] {
+  const norm = Math.sqrt(vec.reduce((sum, v) => sum + v * v, 0)) || 1;
+  return vec.map((v) => v / norm);
+}
+
+/**
+ * Deterministic hashed bag-of-words embedding. Each token is hashed into a few
+ * fixed dimensions; lexical overlap raises cosine similarity. Enough to prove
+ * ranking, not a real semantic score. No network, no keys.
+ */
+export function createMockEmbedder(dim = MOCK_DIM): Embedder {
+  return {
+    name: `mock-hashed-bow(dim=${dim})`,
+    dims: dim,
+    embed: async (text: string) => {
+      const vec = new Array<number>(dim).fill(0);
+      for (const token of tokenize(text)) {
+        const h = createHash('sha256').update(token).digest();
+        // Spread each token across 4 slots with signed weights.
+        for (let s = 0; s < 4; s++) {
+          const idx = h.readUInt32LE(s * 4) % dim;
+          const sign = (h[s * 4 + 3] & 1) === 0 ? 1 : -1;
+          vec[idx] += sign;
+        }
+      }
+      return l2normalize(vec);
+    },
+  };
+}
+
+interface EmbedCache {
+  get(key: string): number[] | undefined;
+  set(key: string, vec: number[]): void;
+  flush(): Promise<void>;
+}
+
+async function loadCache(path: string): Promise<EmbedCache> {
+  let map = new Map<string, number[]>();
+  let dirty = false;
+  try {
+    const raw = await readFile(path, 'utf8');
+    map = new Map(Object.entries(JSON.parse(raw) as Record<string, number[]>));
+  } catch {
+    // No cache yet; start empty.
+  }
+  return {
+    get: (key) => map.get(key),
+    set: (key, vec) => {
+      map.set(key, vec);
+      dirty = true;
+    },
+    flush: async () => {
+      if (!dirty) return;
+      try {
+        await mkdir(dirname(path), { recursive: true });
+        await writeFile(path, JSON.stringify(Object.fromEntries(map)));
+        dirty = false;
+      } catch (err) {
+        // The on-disk cache is only a cost optimization. At large scale the map
+        // can exceed V8's max string length when serialized; never let a flush
+        // failure discard an otherwise-completed eval — warn and continue.
+        console.warn(`embedding cache flush skipped: ${(err as Error).message}`);
+      }
+    },
+  };
+}
+
+/**
+ * Real OpenAI text-embedding-3-small (1536 dims) with an on-disk,
+ * content-addressed cache so re-runs are cheap and indexing isn't re-billed.
+ */
+export async function createOpenAIEmbedder(
+  apiKey: string,
+  cachePath: string,
+): Promise<Embedder> {
+  const cache = await loadCache(cachePath);
+  return {
+    name: `openai:${OPENAI_MODEL}`,
+    dims: OPENAI_DIM,
+    embed: async (text: string) => {
+      const key = createHash('sha256').update(`${OPENAI_MODEL}\n${text}`).digest('hex');
+      const cached = cache.get(key);
+      if (cached !== undefined) return cached;
+
+      const res = await fetch('https://api.openai.com/v1/embeddings', {
+        method: 'POST',
+        headers: {
+          'Content-Type': 'application/json',
+          Authorization: `Bearer ${apiKey}`,
+        },
+        body: JSON.stringify({ model: OPENAI_MODEL, input: text }),
+      });
+      if (!res.ok) {
+        const body = await res.text();
+        throw new Error(`OpenAI embeddings failed (${res.status}): ${body.slice(0, 300)}`);
+      }
+      const json = (await res.json()) as { data: { embedding: number[] }[] };
+      const vec = json.data[0].embedding;
+      cache.set(key, vec);
+      return vec;
+    },
+    flush: () => cache.flush(),
+  };
+}
diff --git a/packages/retrieval/eval/longmemeval/fixture.json b/packages/retrieval/eval/longmemeval/fixture.json
new file mode 100644
index 00000000..e45a184b
--- /dev/null
+++ b/packages/retrieval/eval/longmemeval/fixture.json
@@ -0,0 +1,130 @@
+[
+  {
+    "question_id": "fix_001",
+    "question_type": "single-session-user",
+    "question": "What programming language do I use at work?",
+    "answer": "Python",
+    "question_date": "2024-05-01",
+    "haystack_session_ids": ["A_s0", "A_s1", "A_s2", "A_s3", "A_s4"],
+    "haystack_dates": ["2024-01-03", "2024-01-10", "2024-02-02", "2024-02-20", "2024-03-15"],
+    "haystack_sessions": [
+      [
+        { "role": "user", "content": "I tried a new ramen place downtown last night." },
+        { "role": "assistant", "content": "How was the broth?" }
+      ],
+      [
+        { "role": "user", "content": "At work I write Python every day for our data pipelines.", "has_answer": true },
+        { "role": "assistant", "content": "Python is a solid choice for data engineering." }
+      ],
+      [
+        { "role": "user", "content": "The weather has been rainy all week here." },
+        { "role": "assistant", "content": "Hopefully it clears up soon." }
+      ],
+      [
+        { "role": "user", "content": "I watched a documentary about deep sea creatures." },
+        { "role": "assistant", "content": "Those are fascinating." }
+      ],
+      [
+        { "role": "user", "content": "My knee felt better after the morning run." },
+        { "role": "assistant", "content": "Glad the recovery is going well." }
+      ]
+    ],
+    "answer_session_ids": ["A_s1"]
+  },
+  {
+    "question_id": "fix_002",
+    "question_type": "single-session-preference",
+    "question": "Which city did I book a flight to?",
+    "answer": "Tokyo",
+    "question_date": "2024-05-02",
+    "haystack_session_ids": ["B_s0", "B_s1", "B_s2", "B_s3", "B_s4"],
+    "haystack_dates": ["2024-01-05", "2024-01-22", "2024-02-11", "2024-03-01", "2024-03-30"],
+    "haystack_sessions": [
+      [
+        { "role": "user", "content": "I reorganized my bookshelf by color this weekend." },
+        { "role": "assistant", "content": "That sounds satisfying." }
+      ],
+      [
+        { "role": "user", "content": "My sourdough starter finally rose properly." },
+        { "role": "assistant", "content": "Nice, persistence pays off." }
+      ],
+      [
+        { "role": "user", "content": "I just booked a flight to Tokyo for next spring.", "has_answer": true },
+        { "role": "assistant", "content": "Exciting, Tokyo in spring is beautiful." }
+      ],
+      [
+        { "role": "user", "content": "The team standup ran long again today." },
+        { "role": "assistant", "content": "Long meetings are draining." }
+      ],
+      [
+        { "role": "user", "content": "I planted tomatoes and basil on the balcony." },
+        { "role": "assistant", "content": "Fresh herbs are the best." }
+      ]
+    ],
+    "answer_session_ids": ["B_s2"]
+  },
+  {
+    "question_id": "fix_003",
+    "question_type": "single-session-user",
+    "question": "What is my dog's name?",
+    "answer": "Rex",
+    "question_date": "2024-05-03",
+    "haystack_session_ids": ["C_s0", "C_s1", "C_s2", "C_s3", "C_s4"],
+    "haystack_dates": ["2024-01-08", "2024-01-19", "2024-02-14", "2024-02-28", "2024-03-21"],
+    "haystack_sessions": [
+      [
+        { "role": "user", "content": "I finished reading a long fantasy novel finally." },
+        { "role": "assistant", "content": "Which series was it?" }
+      ],
+      [
+        { "role": "user", "content": "My dog Rex loves to play fetch in the park every morning.", "has_answer": true },
+        { "role": "assistant", "content": "Rex sounds full of energy." }
+      ],
+      [
+        { "role": "user", "content": "I switched my coffee order to a flat white." },
+        { "role": "assistant", "content": "A good choice." }
+      ],
+      [
+        { "role": "user", "content": "The car needed an oil change this week." },
+        { "role": "assistant", "content": "Routine maintenance helps." }
+      ],
+      [
+        { "role": "user", "content": "I repainted the spare bedroom a soft green." },
+        { "role": "assistant", "content": "Calming color." }
+      ]
+    ],
+    "answer_session_ids": ["C_s1"]
+  },
+  {
+    "question_id": "fix_004",
+    "question_type": "multi-session",
+    "question": "How many siblings do I have?",
+    "answer": "two",
+    "question_date": "2024-05-04",
+    "haystack_session_ids": ["D_s0", "D_s1", "D_s2", "D_s3", "D_s4"],
+    "haystack_dates": ["2024-01-12", "2024-01-25", "2024-02-09", "2024-03-05", "2024-03-27"],
+    "haystack_sessions": [
+      [
+        { "role": "user", "content": "I upgraded my laptop's memory over the weekend." },
+        { "role": "assistant", "content": "More RAM always helps." }
+      ],
+      [
+        { "role": "user", "content": "We tried indoor rock climbing for the first time." },
+        { "role": "assistant", "content": "How did it go?" }
+      ],
+      [
+        { "role": "user", "content": "I grew up with two siblings, an older brother and a younger sister.", "has_answer": true },
+        { "role": "assistant", "content": "A middle child then." }
+      ],
+      [
+        { "role": "user", "content": "The bakery near my office started selling pretzels." },
+        { "role": "assistant", "content": "Warm pretzels are great." }
+      ],
+      [
+        { "role": "user", "content": "I set up a standing desk to fix my posture." },
+        { "role": "assistant", "content": "Your back will thank you." }
+      ]
+    ],
+    "answer_session_ids": ["D_s2"]
+  }
+]
diff --git a/packages/retrieval/eval/longmemeval/judge.ts b/packages/retrieval/eval/longmemeval/judge.ts
new file mode 100644
index 00000000..122e13b7
--- /dev/null
+++ b/packages/retrieval/eval/longmemeval/judge.ts
@@ -0,0 +1,55 @@
+import type { Judge } from './types';
+import { chat } from './reader';
+
+// Judge model. Defaults to gpt-5.5; override with LONGMEMEVAL_JUDGE_MODEL to run
+// a like-for-like comparison config (e.g. gpt-4o) without editing the default.
+const JUDGE_MODEL = process.env.LONGMEMEVAL_JUDGE_MODEL ?? 'gpt-5.5';
+
+function normalize(text: string): string {
+  return text
+    .toLowerCase()
+    .replace(/[^a-z0-9\s]/g, ' ')
+    .replace(/\s+/g, ' ')
+    .trim();
+}
+
+/**
+ * Mock judge: normalized substring/exact match against gold. Good enough to
+ * grade the bundled fixture deterministically without a model.
+ */
+export function createMockJudge(): Judge {
+  return {
+    name: 'mock-substring',
+    grade: async (_question: string, gold: string, predicted: string) => {
+      const g = normalize(gold);
+      const p = normalize(predicted);
+      // An empty prediction (e.g. a retrieval miss leaving the reader no
+      // context) must never grade correct — `g.includes('')` is always true.
+      if (g.length === 0 || p.length === 0) return false;
+      return p === g || p.includes(g) || g.includes(p);
+    },
+  };
+}
+
+/** Real judge: GPT grader returning correct/incorrect, LongMemEval-style. */
+export function createOpenAIJudge(apiKey: string): Judge {
+  return {
+    name: `openai-judge:${JUDGE_MODEL}`,
+    grade: async (question: string, gold: string, predicted: string) => {
+      const system =
+        'You are an impartial grader for a long-term memory QA benchmark. ' +
+        'Given a question, the gold answer, and a model answer, decide whether the ' +
+        'model answer is correct (conveys the same key information as the gold answer). ' +
+        'Reply with exactly one word: "correct" or "incorrect".';
+      const user = `Question: ${question}\nGold answer: ${gold}\nModel answer: ${predicted}\n\nVerdict:`;
+      const verdict = await chat(apiKey, JUDGE_MODEL, system, user);
+      const v = verdict.toLowerCase();
+      // `\bcorrect\b` does not match inside "incorrect" (no word boundary), so
+      // detect the verdict by whole word and reject negated/partial forms like
+      // "incorrect", "not correct", or "partially correct".
+      const saysCorrect = /\bcorrect\b/.test(v);
+      const negated = /\bincorrect\b/.test(v) || /\b(?:not|partially|isn't)\b[\s\S]{0,20}\bcorrect\b/.test(v);
+      return saysCorrect && !negated;
+    },
+  };
+}
diff --git a/packages/retrieval/eval/longmemeval/reader.ts b/packages/retrieval/eval/longmemeval/reader.ts
new file mode 100644
index 00000000..87660ee5
--- /dev/null
+++ b/packages/retrieval/eval/longmemeval/reader.ts
@@ -0,0 +1,81 @@
+import type { Reader } from './types';
+
+// Reader model. Defaults to gpt-5.4; override with LONGMEMEVAL_READER_MODEL to
+// run a like-for-like comparison config (e.g. gpt-4o) without editing the default.
+const CHAT_MODEL = process.env.LONGMEMEVAL_READER_MODEL ?? 'gpt-5.4';
+
+/**
+ * GPT-5-tier reasoning models reject a non-default `temperature`; callers must
+ * omit it for those models and keep deterministic `temperature: 0` elsewhere.
+ */
+function isGpt5Tier(model: string): boolean {
+  return /^gpt-5/i.test(model);
+}
+
+/** Mock reader: echo the top retrieved chunk's text as the answer. */
+export function createMockReader(): Reader {
+  return {
+    name: 'mock-top-hit',
+    answer: async (_question: string, contexts: string[]) => contexts[0] ?? '',
+  };
+}
+
+async function chat(
+  apiKey: string,
+  model: string,
+  system: string,
+  user: string,
+): Promise<string> {
+  const body: Record<string, unknown> = {
+    model,
+    messages: [
+      { role: 'system', content: system },
+      { role: 'user', content: user },
+    ],
+  };
+  // Deterministic grading where the model allows it; GPT-5-tier models reject a
+  // non-default temperature, so omit the field and let the API default stand.
+  if (!isGpt5Tier(model)) {
+    body.temperature = 0;
+  }
+  const res = await fetch('https://api.openai.com/v1/chat/completions', {
+    method: 'POST',
+    headers: {
+      'Content-Type': 'application/json',
+      Authorization: `Bearer ${apiKey}`,
+    },
+    body: JSON.stringify(body),
+  });
+  if (!res.ok) {
+    const body = await res.text();
+    throw new Error(`OpenAI chat failed (${res.status}): ${body.slice(0, 300)}`);
+  }
+  const json = (await res.json()) as { choices: { message: { content: string } }[] };
+  return json.choices[0].message.content.trim();
+}
+
+/** Real reader: answer the question from retrieved context. */
+export function createOpenAIReader(apiKey: string): Reader {
+  return {
+    name: `openai:${CHAT_MODEL}`,
+    answer: async (question: string, contexts: string[]) => {
+      const system =
+        'You answer questions about a user from the provided conversation excerpts. ' +
+        'Answer concisely. ' +
+        'For factual questions, give the answer stated in the excerpts; if the excerpts do not ' +
+        'contain it, say "I don\'t know". ' +
+        'For questions asking for a recommendation, suggestion, advice, or tips, infer and give a ' +
+        'recommendation grounded in what the excerpts reveal about this user\'s preferences, ' +
+        'context, and history. Base the recommendation only on preferences and signals actually ' +
+        'present in the excerpts — do not invent preferences or recommend from general knowledge ' +
+        'unsupported by the excerpts. If the excerpts reveal nothing relevant to the request, ' +
+        'say "I don\'t know".';
+      const user = `Conversation excerpts:\n${contexts
+        .map((c, i) => `[${i + 1}] ${c}`)
+        .join('\n\n')}\n\nQuestion: ${question}\nAnswer:`;
+      return chat(apiKey, CHAT_MODEL, system, user);
+    },
+  };
+}
+
+export { chat };
diff --git a/packages/retrieval/eval/longmemeval/run.ts b/packages/retrieval/eval/longmemeval/run.ts
new file mode 100644
index 00000000..0003ff4a
--- /dev/null
+++ b/packages/retrieval/eval/longmemeval/run.ts
@@ -0,0 +1,83 @@
+import { fileURLToPath } from 'node:url';
+import { dirname, join } from 'node:path';
+import { createMockEmbedder, createOpenAIEmbedder } from './embed';
+import { createMockStore, createRealStore } from './store';
+import { createMockReader, createOpenAIReader } from './reader';
+import { createMockJudge, createOpenAIJudge } from './judge';
+import { loadDataset } from './dataset';
+import { runEval, formatSummary } from './runner';
+import type { ChunkMode, Embedder, Judge, Reader, Store } from './types';
+
+function envInt(name: string, fallback: number): number {
+  const raw = process.env[name];
+  if (raw === undefined || raw === '') return fallback;
+  const value = parseInt(raw, 10);
+  return Number.isFinite(value) && value > 0 ? value : fallback;
+}
+
+async function main(): Promise<void> {
+  const apiKey = process.env.OPENAI_API_KEY;
+  const valkeyUrl = process.env.VALKEY_URL ?? 'redis://:devpassword@localhost:6384';
+  const dataPath = process.env.LONGMEMEVAL_DATA;
+  const limit = envInt('LONGMEMEVAL_LIMIT', 20);
+  const k = envInt('LONGMEMEVAL_K', 10);
+  const chunkMode: ChunkMode = process.env.LONGMEMEVAL_CHUNK === 'turn' ? 'turn' : 'session';
+  const qa = process.env.LONGMEMEVAL_QA === '1';
+
+  const cachePath = join(dirname(fileURLToPath(import.meta.url)), '.cache', 'embeddings.json');
+
+  // EMBEDDER seam.
+  let embedder: Embedder;
+  if (apiKey !== undefined && apiKey !== '') {
+    embedder = await createOpenAIEmbedder(apiKey, cachePath);
+  } else {
+    embedder = createMockEmbedder();
+  }
+
+  // STORE seam.
+  let store: Store | null = await createRealStore(valkeyUrl);
+  if (store === null) {
+    store = createMockStore();
+  }
+
+  // READER + JUDGE seams (Tier 2 only).
+  let reader: Reader | null = null;
+  let judge: Judge | null = null;
+  if (qa) {
+    if (apiKey !== undefined && apiKey !== '') {
+      reader = createOpenAIReader(apiKey);
+      judge = createOpenAIJudge(apiKey);
+    } else {
+      reader = createMockReader();
+      judge = createMockJudge();
+    }
+  }
+
+  const { records, source } = await loadDataset(dataPath);
+
+  const tier = qa ? 'Tier 2 (retrieval + QA)' : store.isReal || embedder.dims === 1536 ? 'Tier 1 (real recall)' : 'Tier 0 (offline)';
+
+  console.log('='.repeat(64));
+  console.log('LongMemEval retrieval harness — @betterdb/retrieval');
+  console.log('='.repeat(64));
+  console.log(`tier      : ${tier}`);
+  console.log(`embedder  : ${embedder.name}  (dims=${embedder.dims})`);
+  console.log(`store     : ${store.name}${store.isReal ? '' : '  (Valkey unreachable → mock)'}`);
+  console.log(`reader    : ${reader === null ? 'disabled' : reader.name}`);
+  console.log(`judge     : ${judge === null ? 'disabled' : judge.name}`);
+  console.log(`dataset   : ${source}  (${records.length} records)`);
+  console.log(`params    : limit=${limit} k=${k} chunk=${chunkMode} qa=${qa}`);
+  console.log('='.repeat(64));
+
+  try {
+    const summary = await runEval({ records, embedder, store, reader, judge, k, chunkMode, limit });
+    console.log(formatSummary(summary));
+  } finally {
+    await store.close();
+  }
+}
+
+main().catch((err) => {
+  console.error(err);
+  process.exit(1);
+});
diff --git a/packages/retrieval/eval/longmemeval/runner.ts b/packages/retrieval/eval/longmemeval/runner.ts
new file mode 100644
index 00000000..ad5a0f90
--- /dev/null
+++ b/packages/retrieval/eval/longmemeval/runner.ts
@@ -0,0 +1,191 @@
+import { Retriever } from '../../src/index';
+import type { RetrievalSchema } from '../../src/index';
+import { chunkRecord, recordIsHit } from './adapter';
+import type { ChunkMode, Embedder, Judge, LmeRecord, Reader, Store } from './types';
+
+export interface RunConfig {
+  records: LmeRecord[];
+  embedder: Embedder;
+  store: Store;
+  reader: Reader | null;
+  judge: Judge | null;
+  k: number;
+  chunkMode: ChunkMode;
+  limit: number;
+}
+
+export interface TypeStats {
+  type: string;
+  total: number;
+  recallHits: number;
+  qaCorrect: number;
+}
+
+export interface EvalSummary {
+  total: number;
+  recallHits: number;
+  recallAtK: number;
+  qaRun: boolean;
+  qaCorrect: number;
+  qaAccuracy: number;
+  k: number;
+  totalChunks: number;
+  byType: Map<string, TypeStats>;
+}
+
+const sleep = (ms: number): Promise<void> => new Promise((resolve) => setTimeout(resolve, ms));
+
+async function pollUntil(predicate: () => Promise<boolean>, attempts = 40): Promise<boolean> {
+  for (let i = 0; i < attempts; i++) {
+    if (await predicate()) return true;
+    await sleep(100);
+  }
+  return false;
+}
+
+function buildSchema(dims: number): RetrievalSchema {
+  return {
+    fields: {
+      session_id: { type: 'tag' },
+      date: { type: 'tag' },
+    },
+    vector: { metric: 'cosine', algorithm: 'hnsw', dims },
+  };
+}
+
+function bump(byType: Map<string, TypeStats>, type: string): TypeStats {
+  let stats = byType.get(type);
+  if (stats === undefined) {
+    stats = { type, total: 0, recallHits: 0, qaCorrect: 0 };
+    byType.set(type, stats);
+  }
+  return stats;
+}
+
+export async function runEval(config: RunConfig): Promise<EvalSummary> {
+  const { records, embedder, store, reader, judge, k, chunkMode, limit } = config;
+  const qaRun = reader !== null && judge !== null;
+  const schema = buildSchema(embedder.dims);
+  const byType = new Map<string, TypeStats>();
+
+  let total = 0;
+  let recallHits = 0;
+  let qaCorrect = 0;
+  let totalChunks = 0;
+
+  const slice = records.slice(0, limit);
+  // Flush in `finally` so a mid-run failure (embedding/Valkey/reader error)
+  // still persists the embeddings already computed — billable work must not be
+  // discarded just because the run didn't reach the end.
+  try {
+    for (let i = 0; i < slice.length; i++) {
+      const record = slice[i];
+      const name = `lme_${i}_${Math.random().toString(36).slice(2, 8)}`;
+      const retriever = new Retriever({
+        client: store.client,
+        name,
+        schema,
+        embedFn: embedder.embed,
+      });
+
+      const chunks = chunkRecord(record, chunkMode);
+      totalChunks += chunks.length;
+
+      await retriever.createIndex();
+      await retriever.upsert(chunks);
+
+      if (store.isReal) {
+        // A hit-count check can pass while HNSW is still backfilling. Wait for the
+        // index to report every chunk ingested and fully indexed so recall is not
+        // measured on an incomplete graph.
+        const settled = await pollUntil(async () => {
+          const h = await retriever.health();
+          // percentIndexed is normalized to a 0-100 scale; require full coverage.
+          return h.numDocs >= chunks.length && h.percentIndexed >= 100;
+        });
+        if (!settled) {
+          console.warn(
+            `index ${name} did not settle within the poll window (record ${i + 1}); recall may be undercounted`,
+          );
+        }
+      }
+
+      const hits = await retriever.query({ text: record.question, k });
+      const hit = recordIsHit(hits, record.answer_session_ids);
+
+      const stats = bump(byType, record.question_type);
+      stats.total++;
+      total++;
+      if (hit) {
+        stats.recallHits++;
+        recallHits++;
+      }
+
+      if (qaRun && reader !== null && judge !== null) {
+        // Temporal-reasoning questions need the session date (stored on the chunk's
+        // `date` tag) and the question's asked-on date in the prompt; passing only
+        // hit.text strips both and depresses temporal QA. Prefix each excerpt with
+        // its date and carry question_date into the question the reader sees.
+        const contexts = hits.map((h) => (h.fields.date ? `[${h.fields.date}] ${h.text}` : h.text));
+        const question =
+          record.question_date !== undefined && record.question_date !== ''
+            ? `${record.question} (question asked on ${record.question_date})`
+            : record.question;
+        const answer = await reader.answer(question, contexts);
+        // Grade against the same date-anchored question the reader saw, so the
+        // judge has the temporal anchor too and doesn't mismark temporal items.
+        const correct = await judge.grade(question, record.answer, answer);
+        if (correct) {
+          stats.qaCorrect++;
+          qaCorrect++;
+        }
+      }
+
+      await retriever.delete(chunks.map((c) => c.id)).catch(() => {});
+      await retriever.dropIndex().catch(() => {});
+    }
+  } finally {
+    await embedder.flush?.();
+  }
+
+  return {
+    total,
+    recallHits,
+    recallAtK: total > 0 ? recallHits / total : 0,
+    qaRun,
+    qaCorrect,
+    qaAccuracy: qaRun && total > 0 ? qaCorrect / total : 0,
+    k,
+    totalChunks,
+    byType,
+  };
+}
+
+export function formatSummary(summary: EvalSummary): string {
+  const lines: string[] = [];
+  const pct = (n: number): string => `${(n * 100).toFixed(1)}%`;
+  lines.push('');
+  lines.push(`Records: ${summary.total}   Chunks indexed: ${summary.totalChunks}   k=${summary.k}`);
+  lines.push('');
+
+  const header = summary.qaRun
+    ? 'question_type                         n   recall@k   QA-acc'
+    : 'question_type                         n   recall@k';
+  lines.push(header);
+  lines.push('-'.repeat(header.length));
+
+  const rows = Array.from(summary.byType.values()).sort((a, b) => a.type.localeCompare(b.type));
+  for (const row of rows) {
+    const recall = pct(row.total > 0 ? row.recallHits / row.total : 0);
+    const base = `${row.type.padEnd(36)} ${String(row.total).padStart(3)}   ${recall.padStart(8)}`;
+    lines.push(summary.qaRun ? `${base}   ${pct(row.qaCorrect / row.total).padStart(6)}` : base);
+  }
+
+  lines.push('-'.repeat(header.length));
+  const overall = `${'OVERALL'.padEnd(36)} ${String(summary.total).padStart(3)}   ${pct(
+    summary.recallAtK,
+  ).padStart(8)}`;
+  lines.push(summary.qaRun ? `${overall}   ${pct(summary.qaAccuracy).padStart(6)}` : overall);
+  lines.push('');
+  return lines.join('\n');
+}
diff --git a/packages/retrieval/eval/longmemeval/store.ts b/packages/retrieval/eval/longmemeval/store.ts
new file mode 100644
index 00000000..21cee4a2
--- /dev/null
+++ b/packages/retrieval/eval/longmemeval/store.ts
@@ -0,0 +1,311 @@
+import type { RetrieverClient } from '../../src/index';
+import type { Store } from './types';
+
+type FieldType = 'tag' | 'numeric' | 'text' | 'vector';
+
+interface IndexConfig {
+  name: string;
+  prefix: string;
+  vectorField: string;
+  dims: number;
+  fieldTypes: Record<string, FieldType>;
+}
+
+function decodeFloat32(value: unknown): number[] {
+  if (!Buffer.isBuffer(value)) {
+    return [];
+  }
+  const out: number[] = [];
+  for (let i = 0; i + 4 <= value.length; i += 4) {
+    out.push(value.readFloatLE(i));
+  }
+  return out;
+}
+
+function cosineDistance(a: number[], b: number[]): number {
+  let dot = 0;
+  let na = 0;
+  let nb = 0;
+  const len = Math.min(a.length, b.length);
+  for (let i = 0; i < len; i++) {
+    dot += a[i] * b[i];
+    na += a[i] * a[i];
+    nb += b[i] * b[i];
+  }
+  const denom = Math.sqrt(na) * Math.sqrt(nb) || 1;
+  return 1 - dot / denom;
+}
+
+function unescapeTag(value: string): string {
+  return value.replace(/\\(.)/g, '$1');
+}
+
+interface ParsedQuery {
+  k: number;
+  scoreField: string;
+  tagFilters: { field: string; value: string }[];
+  numericFilters: { field: string; value: number }[];
+}
+
+function parseQuery(queryString: string): ParsedQuery {
+  const knn = /\[KNN (\d+) @(\S+) \$vec AS (\S+)\]/.exec(queryString);
+  if (knn === null) {
+    throw new Error(`Mock FT.SEARCH could not parse KNN clause: ${queryString}`);
+  }
+  const k = parseInt(knn[1], 10);
+  const scoreField = knn[3];
+
+  const filterExpr = queryString.slice(0, queryString.indexOf('=>'));
+  const tagFilters: { field: string; value: string }[] = [];
+  const numericFilters: { field: string; value: number }[] = [];
+
+  if (filterExpr.trim() !== '*') {
+    const tagRe = /@(\w+):\{([^}]*)\}/g;
+    let m: RegExpExecArray | null;
+    while ((m = tagRe.exec(filterExpr)) !== null) {
+      tagFilters.push({ field: m[1], value: unescapeTag(m[2]) });
+    }
+    const numRe = /@(\w+):\[(\S+) (\S+)\]/g;
+    while ((m = numRe.exec(filterExpr)) !== null) {
+      numericFilters.push({ field: m[1], value: parseFloat(m[2]) });
+    }
+  }
+
+  return { k, scoreField, tagFilters, numericFilters };
+}
+
+/**
+ * In-memory store that implements the subset of the valkey-search command
+ * surface the Retriever uses. FT.SEARCH parses the query string the SDK builds,
+ * decodes the Float32 PARAMS vector, computes cosine distance to every stored
+ * vector, applies tag/numeric filters and returns the top-k in the exact reply
+ * shape parseFtSearchResponse expects. Behaves like FLAT-exact (≈ HNSW on small
+ * data), and is fully synchronous so Tier 0 is deterministic.
+ */
+class MockRetrieverClient implements RetrieverClient {
+  private readonly indexes = new Map<string, IndexConfig>();
+  private readonly hashes = new Map<string, Map<string, string | Buffer>>();
+
+  async call(command: string, ...args: (string | Buffer | number)[]): Promise<unknown> {
+    switch (command.toUpperCase()) {
+      case 'HSET':
+        return this.hset(args);
+      case 'DEL':
+        return this.del(args);
+      case 'HDEL':
+        return this.hdel(args);
+      case 'FT.CREATE':
+        return this.ftCreate(args);
+      case 'FT.INFO':
+        return this.ftInfo(args);
+      case 'FT.DROPINDEX':
+        return this.ftDropIndex(args);
+      case 'FT._LIST':
+        return Array.from(this.indexes.keys());
+      case 'FT.SEARCH':
+        return this.ftSearch(args);
+      default:
+        throw new Error(`MockRetrieverClient: unsupported command ${command}`);
+    }
+  }
+
+  private hset(args: (string | Buffer | number)[]): number {
+    const key = String(args[0]);
+    let map = this.hashes.get(key);
+    if (map === undefined) {
+      map = new Map();
+      this.hashes.set(key, map);
+    }
+    for (let i = 1; i + 1 < args.length; i += 2) {
+      const field = String(args[i]);
+      const value = args[i + 1];
+      map.set(field, Buffer.isBuffer(value) ? value : String(value));
+    }
+    return 1;
+  }
+
+  private del(args: (string | Buffer | number)[]): number {
+    let removed = 0;
+    for (const arg of args) {
+      if (this.hashes.delete(String(arg))) {
+        removed++;
+      }
+    }
+    return removed;
+  }
+
+  private hdel(args: (string | Buffer | number)[]): number {
+    const key = String(args[0]);
+    const map = this.hashes.get(key);
+    if (map === undefined) return 0;
+    let removed = 0;
+    for (let i = 1; i < args.length; i++) {
+      if (map.delete(String(args[i]))) removed++;
+    }
+    return removed;
+  }
+
+  private ftCreate(args: (string | Buffer | number)[]): string {
+    const tokens = args.map(String);
+    const name = tokens[0];
+    const prefixIdx = tokens.indexOf('PREFIX');
+    const prefix = tokens[prefixIdx + 2];
+    const schemaIdx = tokens.indexOf('SCHEMA');
+    const fieldTypes: Record<string, FieldType> = {};
+    let vectorField = 'embedding';
+    let dims = 0;
+
+    let i = schemaIdx + 1;
+    while (i < tokens.length) {
+      const fieldName = tokens[i];
+      const type = tokens[i + 1];
+      i += 2;
+      if (type === 'TAG') {
+        fieldTypes[fieldName] = 'tag';
+        if (tokens[i] === 'SEPARATOR') i += 2;
+      } else if (type === 'NUMERIC') {
+        fieldTypes[fieldName] = 'numeric';
+        if (tokens[i] === 'SORTABLE') i += 1;
+      } else if (type === 'TEXT') {
+        fieldTypes[fieldName] = 'text';
+      } else if (type === 'VECTOR') {
+        fieldTypes[fieldName] = 'vector';
+        vectorField = fieldName;
+        const count = parseInt(tokens[i + 1], 10);
+        const pairStart = i + 2;
+        for (let j = pairStart; j < pairStart + count; j += 2) {
+          if (tokens[j] === 'DIM') {
+            dims = parseInt(tokens[j + 1], 10);
+          }
+        }
+        i = pairStart + count;
+      } else {
+        // Unknown token; skip defensively.
+        i += 1;
+      }
+    }
+
+    this.indexes.set(name, { name, prefix, vectorField, dims, fieldTypes });
+    return 'OK';
+  }
+
+  private requireIndex(name: string): IndexConfig {
+    const cfg = this.indexes.get(name);
+    if (cfg === undefined) {
+      throw new Error(`Unknown index name: ${name}`);
+    }
+    return cfg;
+  }
+
+  private docsForIndex(cfg: IndexConfig): [string, Map<string, string | Buffer>][] {
+    return Array.from(this.hashes.entries()).filter(([key]) => key.startsWith(cfg.prefix));
+  }
+
+  private ftInfo(args: (string | Buffer | number)[]): unknown[] {
+    const name = String(args[0]);
+    const cfg = this.requireIndex(name);
+    const numDocs = this.docsForIndex(cfg).length;
+    return [
+      'index_name',
+      name,
+      'num_docs',
+      String(numDocs),
+      'indexing',
+      '0',
+      'percent_indexed',
+      '1',
+      'attributes',
+      [['identifier', cfg.vectorField, 'type', 'VECTOR', 'dim', String(cfg.dims)]],
+    ];
+  }
+
+  private ftDropIndex(args: (string | Buffer | number)[]): string {
+    const name = String(args[0]);
+    this.requireIndex(name);
+    this.indexes.delete(name);
+    return 'OK';
+  }
+
+  private ftSearch(args: (string | Buffer | number)[]): unknown[] {
+    const name = String(args[0]);
+    const cfg = this.requireIndex(name);
+    const queryString = String(args[1]);
+    const parsed = parseQuery(queryString);
+
+    const vecIdx = args.findIndex((a) => a === 'vec');
+    const queryVec = decodeFloat32(args[vecIdx + 1]);
+
+    const scored: { key: string; map: Map<string, string | Buffer>; distance: number }[] = [];
+    for (const [key, map] of this.docsForIndex(cfg)) {
+      let pass = true;
+      for (const f of parsed.tagFilters) {
+        if (String(map.get(f.field) ?? '') !== f.value) {
+          pass = false;
+          break;
+        }
+      }
+      if (pass) {
+        for (const f of parsed.numericFilters) {
+          if (Number(map.get(f.field)) !== f.value) {
+            pass = false;
+            break;
+          }
+        }
+      }
+      if (!pass) continue;
+      const docVec = decodeFloat32(map.get(cfg.vectorField));
+      scored.push({ key, map, distance: cosineDistance(queryVec, docVec) });
+    }
+
+    scored.sort((a, b) => a.distance - b.distance);
+    const top = scored.slice(0, parsed.k);
+
+    const reply: unknown[] = [top.length];
+    for (const hit of top) {
+      const fieldArray: string[] = [];
+      for (const [field, value] of hit.map.entries()) {
+        if (field === cfg.vectorField) continue;
+        fieldArray.push(field, value.toString());
+      }
+      fieldArray.push(parsed.scoreField, String(hit.distance));
+      reply.push(hit.key, fieldArray);
+    }
+    return reply;
+  }
+}
+
+export function createMockStore(): Store {
+  return {
+    name: 'mock-in-memory',
+    isReal: false,
+    client: new MockRetrieverClient(),
+    close: async () => {},
+  };
+}
+
+/**
+ * Real valkey-search store via iovalkey. Returns null if the server is
+ * unreachable or the search module (FT._LIST) is absent, so callers can fall
+ * back to the mock store gracefully (same skip-guard as Phase 6).
+ */
+export async function createRealStore(url: string): Promise<Store | null> {
+  const { default: Valkey } = await import('iovalkey');
+  const client = new Valkey(url, { lazyConnect: true, retryStrategy: () => null });
+  client.on('error', () => {});
+  try {
+    await client.connect();
+    await client.ping();
+    await client.call('FT._LIST');
+  } catch {
+    await client.quit().catch(() => {});
+    return null;
+  }
+  return {
+    name: `valkey-search@${url.replace(/:[^:@/]*@/, ':***@')}`,
+    isReal: true,
+    client: client as unknown as Store['client'],
+    close: async () => {
+      await client.quit().catch(() => {});
+    },
+  };
+}
diff --git a/packages/retrieval/eval/longmemeval/types.ts b/packages/retrieval/eval/longmemeval/types.ts
new file mode 100644
index 00000000..a24c548b
--- /dev/null
+++ b/packages/retrieval/eval/longmemeval/types.ts
@@ -0,0 +1,64 @@
+import type { RetrieverClient } from '../../src/index';
+
+/** A single turn in a LongMemEval session. */
+export interface LmeTurn {
+  role: 'user' | 'assistant';
+  content: string;
+  has_answer?: boolean;
+}
+
+/** One session is an ordered list of turns. */
+export type LmeSession = LmeTurn[];
+
+/**
+ * LongMemEval record shape (real dataset + bundled fixture).
+ * See https://github.com/xiaowu0162/LongMemEval
+ */
+export interface LmeRecord {
+  question_id: string;
+  question_type: string;
+  question: string;
+  answer: string;
+  question_date?: string;
+  haystack_session_ids: string[];
+  haystack_dates?: string[];
+  haystack_sessions: LmeSession[];
+  answer_session_ids: string[];
+}
+
+export type ChunkMode = 'session' | 'turn';
+
+/**
+ * EMBEDDER seam. `dims` MUST equal the length the `embed` function returns so
+ * the schema's vector.dims matches (OpenAI text-embedding-3-small = 1536).
+ */
+export interface Embedder {
+  name: string;
+  dims: number;
+  embed: (text: string) => Promise<number[]>;
+  /** Persist any cache to disk (no-op for the mock). */
+  flush?: () => Promise<void>;
+}
+
+/**
+ * STORE seam. `client` is handed to the Retriever. `isReal` drives async
+ * index-settling polling; mock stores are synchronous and exact.
+ */
+export interface Store {
+  name: string;
+  isReal: boolean;
+  client: RetrieverClient;
+  close: () => Promise<void>;
+}
+
+/** READER seam (Tier 2): generate an answer from retrieved context. */
+export interface Reader {
+  name: string;
+  answer: (question: string, contexts: string[]) => Promise<string>;
+}
+
+/** JUDGE seam (Tier 2): grade a generated answer against gold. */
+export interface Judge {
+  name: string;
+  grade: (question: string, gold: string, predicted: string) => Promise<boolean>;
+}
diff --git a/packages/retrieval/package.json b/packages/retrieval/package.json
index 05af4cd9..ff0a687c 100644
--- a/packages/retrieval/package.json
+++ b/packages/retrieval/package.json
@@ -32,6 +32,7 @@
     "typecheck": "tsc --noEmit",
     "test": "vitest run",
     "test:watch": "vitest",
+    "eval:longmemeval": "tsx eval/longmemeval/run.ts",
     "clean": "rm -rf dist"
   },
   "dependencies": {
@@ -41,6 +42,7 @@
   "devDependencies": {
     "@types/node": "^22.19.15",
     "iovalkey": "^0.3.3",
+    "tsx": "^4.19.2",
     "typescript": "^5.9.3",
     "vitest": "^4.1.1"
   },
diff --git a/packages/retrieval/src/__tests__/longmemeval.test.ts b/packages/retrieval/src/__tests__/longmemeval.test.ts
new file mode 100644
index 00000000..888792ac
--- /dev/null
+++ b/packages/retrieval/src/__tests__/longmemeval.test.ts
@@ -0,0 +1,83 @@
+import { describe, it, expect } from 'vitest';
+import { createMockEmbedder } from '../../eval/longmemeval/embed';
+import { createMockStore } from '../../eval/longmemeval/store';
+import { createMockReader } from '../../eval/longmemeval/reader';
+import { createMockJudge } from '../../eval/longmemeval/judge';
+import { loadFixture } from '../../eval/longmemeval/dataset';
+import { runEval } from '../../eval/longmemeval/runner';
+
+// Tier 0: fully offline (mock store + hashed embed), no keys/network/Docker.
+describe('longmemeval Tier 0 smoke', () => {
+  it('retrieves the evidence session above threshold on the fixture', async () => {
+    const records = await loadFixture();
+    const summary = await runEval({
+      records,
+      embedder: createMockEmbedder(),
+      store: createMockStore(),
+      reader: null,
+      judge: null,
+      k: 2,
+      chunkMode: 'session',
+      limit: 20,
+    });
+
+    expect(summary.total).toBe(records.length);
+    // Lexical mock embedding must rank the evidence session within the top-k.
+    expect(summary.recallAtK).toBeGreaterThanOrEqual(0.75);
+  });
+
+  it('is deterministic across runs', async () => {
+    const records = await loadFixture();
+    const run = (): ReturnType<typeof runEval> =>
+      runEval({
+        records,
+        embedder: createMockEmbedder(),
+        store: createMockStore(),
+        reader: null,
+        judge: null,
+        k: 2,
+        chunkMode: 'session',
+        limit: 20,
+      });
+
+    const a = await run();
+    const b = await run();
+    expect(a.recallHits).toBe(b.recallHits);
+    expect(a.recallAtK).toBe(b.recallAtK);
+  });
+
+  it('runs the mock reader+judge QA path end to end', async () => {
+    const records = await loadFixture();
+    const summary = await runEval({
+      records,
+      embedder: createMockEmbedder(),
+      store: createMockStore(),
+      reader: createMockReader(),
+      judge: createMockJudge(),
+      k: 2,
+      chunkMode: 'session',
+      limit: 20,
+    });
+
+    expect(summary.qaRun).toBe(true);
+    // Mock reader echoes the top hit; the evidence text contains the gold answer.
+    expect(summary.qaAccuracy).toBeGreaterThanOrEqual(0.75);
+  });
+
+  it('supports per-turn chunking', async () => {
+    const records = await loadFixture();
+    const summary = await runEval({
+      records,
+      embedder: createMockEmbedder(),
+      store: createMockStore(),
+      reader: null,
+      judge: null,
+      k: 3,
+      chunkMode: 'turn',
+      limit: 20,
+    });
+
+    expect(summary.total).toBe(records.length);
+    expect(summary.totalChunks).toBeGreaterThan(records.length);
+  });
+});
diff --git a/packages/retrieval/tsconfig.json b/packages/retrieval/tsconfig.json
index 78f3e3fd..7a9258f9 100644
--- a/packages/retrieval/tsconfig.json
+++ b/packages/retrieval/tsconfig.json
@@ -15,5 +15,5 @@
     "types": ["node"]
   },
   "include": ["src/**/*"],
-  "exclude": ["node_modules", "dist", "src/__tests__"]
+  "exclude": ["node_modules", "dist", "src/__tests__", "eval"]
 }
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index f0ba7073..8febc5ad 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -559,6 +559,9 @@ importers:
       iovalkey:
         specifier: ^0.3.3
         version: 0.3.3
+      tsx:
+        specifier: ^4.19.2
+        version: 4.21.0
       typescript:
         specifier: ^5.9.3
         version: 5.9.3