diff --git a/.gitignore b/.gitignore index bdce9acf..29288a44 100644 --- a/.gitignore +++ b/.gitignore @@ -82,3 +82,8 @@ packages/mcp/.mcpregistry_github_token .gitignore packages/mcp/.mcpregistry_registry_token data/license.key + +# LongMemEval retrieval harness: embedding cache + downloaded datasets +packages/retrieval/eval/longmemeval/.cache/ +packages/retrieval/eval/longmemeval/longmemeval_*.json +packages/retrieval/eval/longmemeval/*_oracle.json diff --git a/packages/retrieval/eval/longmemeval/adapter.ts b/packages/retrieval/eval/longmemeval/adapter.ts new file mode 100644 index 00000000..32389b36 --- /dev/null +++ b/packages/retrieval/eval/longmemeval/adapter.ts @@ -0,0 +1,97 @@ +import type { UpsertEntry, QueryHit } from '../../src/index'; +import type { ChunkMode, LmeRecord, LmeSession } from './types'; + +// text-embedding-3-small accepts at most 8191 tokens per input. Cap each chunk +// well under that (~4 chars/token heuristic, with margin) so a long session is +// split into multiple chunks instead of failing the embedding call. Every part +// keeps the session's session_id, so recall (which matches on session_id) is +// unaffected. +const MAX_EMBED_CHARS = 24000; + +/** Hard-slice a string into consecutive pieces each at most `budget` chars. */ +function sliceToBudget(text: string, budget: number): string[] { + const parts: string[] = []; + for (let i = 0; i < text.length; i += budget) { + parts.push(text.slice(i, i + budget)); + } + return parts; +} + +/** Pack a session's turns into newline-joined chunks each within `budget`. */ +function packTurns(session: LmeSession, budget: number): string[] { + const lines: string[] = []; + for (const turn of session) { + const line = `${turn.role}: ${turn.content}`; + if (line.length <= budget) { + lines.push(line); + } else { + // A single turn larger than the budget is hard-sliced so it still embeds. + lines.push(...sliceToBudget(line, budget)); + } + } + const chunks: string[] = []; + let current = ''; + for (const line of lines) { + if (current.length > 0 && current.length + 1 + line.length > budget) { + chunks.push(current); + current = line; + } else { + current = current.length === 0 ? line : `${current}\n${line}`; + } + } + if (current.length > 0) chunks.push(current); + return chunks; +} + +/** + * Turn a LongMemEval haystack into UpsertEntry chunks. + * - 'session' (default): one chunk per session (turns joined); sessions longer + * than the embedder's input budget are split into multiple chunks that all + * carry the same session_id. + * - 'turn': one chunk per turn. + * The id encodes the session index (+ turn/part index when split); fields carry + * the session_id tag (+ date tag when present) so recall can match evidence. + */ +export function chunkRecord(record: LmeRecord, mode: ChunkMode): UpsertEntry[] { + const entries: UpsertEntry[] = []; + record.haystack_sessions.forEach((session, sIdx) => { + const sessionId = record.haystack_session_ids[sIdx] ?? `session_${sIdx}`; + const date = record.haystack_dates?.[sIdx]; + const baseFields: Record = { session_id: sessionId }; + if (date !== undefined && date !== '') { + baseFields.date = date; + } + + if (mode === 'turn') { + session.forEach((turn, tIdx) => { + const text = `${turn.role}: ${turn.content}`; + // A single turn can exceed the embedder budget too; hard-slice it like + // session mode so it still embeds instead of failing the chunk. + const parts = text.length <= MAX_EMBED_CHARS ? [text] : sliceToBudget(text, MAX_EMBED_CHARS); + parts.forEach((part, pIdx) => { + entries.push({ + id: parts.length === 1 ? `s${sIdx}_t${tIdx}` : `s${sIdx}_t${tIdx}_p${pIdx}`, + text: part, + fields: { ...baseFields }, + }); + }); + }); + } else { + const parts = packTurns(session, MAX_EMBED_CHARS); + parts.forEach((text, pIdx) => { + entries.push({ + id: parts.length === 1 ? `s${sIdx}` : `s${sIdx}_p${pIdx}`, + text, + fields: { ...baseFields }, + }); + }); + } + }); + return entries; +} + +/** A record is a recall HIT if any retrieved chunk's session_id is evidence. */ +export function recordIsHit(hits: QueryHit[], answerSessionIds: string[]): boolean { + const evidence = new Set(answerSessionIds); + return hits.some((hit) => evidence.has(hit.fields.session_id)); +} diff --git a/packages/retrieval/eval/longmemeval/dataset.ts b/packages/retrieval/eval/longmemeval/dataset.ts new file mode 100644 index 00000000..1a8670ef --- /dev/null +++ b/packages/retrieval/eval/longmemeval/dataset.ts @@ -0,0 +1,28 @@ +import { readFile } from 'node:fs/promises'; +import { fileURLToPath } from 'node:url'; +import { dirname, join } from 'node:path'; +import type { LmeRecord } from './types'; + +function fixturePath(): string { + return join(dirname(fileURLToPath(import.meta.url)), 'fixture.json'); +} + +/** Load the bundled LongMemEval-shaped fixture (offline, deterministic). */ +export async function loadFixture(): Promise { + const raw = await readFile(fixturePath(), 'utf8'); + return JSON.parse(raw) as LmeRecord[]; +} + +/** + * Load the dataset: the real LongMemEval json at `dataPath` when given, else + * the bundled fixture. Returns records plus a human-readable source label. + */ +export async function loadDataset( + dataPath: string | undefined, +): Promise<{ records: LmeRecord[]; source: string }> { + if (dataPath !== undefined && dataPath !== '') { + const raw = await readFile(dataPath, 'utf8'); + return { records: JSON.parse(raw) as LmeRecord[], source: dataPath }; + } + return { records: await loadFixture(), source: 'bundled fixture' }; +} diff --git a/packages/retrieval/eval/longmemeval/embed.ts b/packages/retrieval/eval/longmemeval/embed.ts new file mode 100644 index 00000000..a7400e3c --- /dev/null +++ b/packages/retrieval/eval/longmemeval/embed.ts @@ -0,0 +1,120 @@ +import { createHash } from 'node:crypto'; +import { mkdir, readFile, writeFile } from 'node:fs/promises'; +import { dirname } from 'node:path'; +import type { Embedder } from './types'; + +const MOCK_DIM = 256; +const OPENAI_MODEL = 'text-embedding-3-small'; +const OPENAI_DIM = 1536; + +function tokenize(text: string): string[] { + return text + .toLowerCase() + .split(/[^a-z0-9]+/) + .filter((t) => t.length > 0); +} + +function l2normalize(vec: number[]): number[] { + const norm = Math.sqrt(vec.reduce((sum, v) => sum + v * v, 0)) || 1; + return vec.map((v) => v / norm); +} + +/** + * Deterministic hashed bag-of-words embedding. Each token is hashed into a few + * fixed dimensions; lexical overlap raises cosine similarity. Enough to prove + * ranking, not a real semantic score. No network, no keys. + */ +export function createMockEmbedder(dim = MOCK_DIM): Embedder { + return { + name: `mock-hashed-bow(dim=${dim})`, + dims: dim, + embed: async (text: string) => { + const vec = new Array(dim).fill(0); + for (const token of tokenize(text)) { + const h = createHash('sha256').update(token).digest(); + // Spread each token across 4 slots with signed weights. + for (let s = 0; s < 4; s++) { + const idx = h.readUInt32LE(s * 4) % dim; + const sign = (h[s * 4 + 3] & 1) === 0 ? 1 : -1; + vec[idx] += sign; + } + } + return l2normalize(vec); + }, + }; +} + +interface EmbedCache { + get(key: string): number[] | undefined; + set(key: string, vec: number[]): void; + flush(): Promise; +} + +async function loadCache(path: string): Promise { + let map = new Map(); + let dirty = false; + try { + const raw = await readFile(path, 'utf8'); + map = new Map(Object.entries(JSON.parse(raw) as Record)); + } catch { + // No cache yet; start empty. + } + return { + get: (key) => map.get(key), + set: (key, vec) => { + map.set(key, vec); + dirty = true; + }, + flush: async () => { + if (!dirty) return; + try { + await mkdir(dirname(path), { recursive: true }); + await writeFile(path, JSON.stringify(Object.fromEntries(map))); + dirty = false; + } catch (err) { + // The on-disk cache is only a cost optimization. At large scale the map + // can exceed V8's max string length when serialized; never let a flush + // failure discard an otherwise-completed eval — warn and continue. + console.warn(`embedding cache flush skipped: ${(err as Error).message}`); + } + }, + }; +} + +/** + * Real OpenAI text-embedding-3-small (1536 dims) with an on-disk, + * content-addressed cache so re-runs are cheap and indexing isn't re-billed. + */ +export async function createOpenAIEmbedder( + apiKey: string, + cachePath: string, +): Promise { + const cache = await loadCache(cachePath); + return { + name: `openai:${OPENAI_MODEL}`, + dims: OPENAI_DIM, + embed: async (text: string) => { + const key = createHash('sha256').update(`${OPENAI_MODEL}\n${text}`).digest('hex'); + const cached = cache.get(key); + if (cached !== undefined) return cached; + + const res = await fetch('https://api.openai.com/v1/embeddings', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${apiKey}`, + }, + body: JSON.stringify({ model: OPENAI_MODEL, input: text }), + }); + if (!res.ok) { + const body = await res.text(); + throw new Error(`OpenAI embeddings failed (${res.status}): ${body.slice(0, 300)}`); + } + const json = (await res.json()) as { data: { embedding: number[] }[] }; + const vec = json.data[0].embedding; + cache.set(key, vec); + return vec; + }, + flush: () => cache.flush(), + }; +} diff --git a/packages/retrieval/eval/longmemeval/fixture.json b/packages/retrieval/eval/longmemeval/fixture.json new file mode 100644 index 00000000..e45a184b --- /dev/null +++ b/packages/retrieval/eval/longmemeval/fixture.json @@ -0,0 +1,130 @@ +[ + { + "question_id": "fix_001", + "question_type": "single-session-user", + "question": "What programming language do I use at work?", + "answer": "Python", + "question_date": "2024-05-01", + "haystack_session_ids": ["A_s0", "A_s1", "A_s2", "A_s3", "A_s4"], + "haystack_dates": ["2024-01-03", "2024-01-10", "2024-02-02", "2024-02-20", "2024-03-15"], + "haystack_sessions": [ + [ + { "role": "user", "content": "I tried a new ramen place downtown last night." }, + { "role": "assistant", "content": "How was the broth?" } + ], + [ + { "role": "user", "content": "At work I write Python every day for our data pipelines.", "has_answer": true }, + { "role": "assistant", "content": "Python is a solid choice for data engineering." } + ], + [ + { "role": "user", "content": "The weather has been rainy all week here." }, + { "role": "assistant", "content": "Hopefully it clears up soon." } + ], + [ + { "role": "user", "content": "I watched a documentary about deep sea creatures." }, + { "role": "assistant", "content": "Those are fascinating." } + ], + [ + { "role": "user", "content": "My knee felt better after the morning run." }, + { "role": "assistant", "content": "Glad the recovery is going well." } + ] + ], + "answer_session_ids": ["A_s1"] + }, + { + "question_id": "fix_002", + "question_type": "single-session-preference", + "question": "Which city did I book a flight to?", + "answer": "Tokyo", + "question_date": "2024-05-02", + "haystack_session_ids": ["B_s0", "B_s1", "B_s2", "B_s3", "B_s4"], + "haystack_dates": ["2024-01-05", "2024-01-22", "2024-02-11", "2024-03-01", "2024-03-30"], + "haystack_sessions": [ + [ + { "role": "user", "content": "I reorganized my bookshelf by color this weekend." }, + { "role": "assistant", "content": "That sounds satisfying." } + ], + [ + { "role": "user", "content": "My sourdough starter finally rose properly." }, + { "role": "assistant", "content": "Nice, persistence pays off." } + ], + [ + { "role": "user", "content": "I just booked a flight to Tokyo for next spring.", "has_answer": true }, + { "role": "assistant", "content": "Exciting, Tokyo in spring is beautiful." } + ], + [ + { "role": "user", "content": "The team standup ran long again today." }, + { "role": "assistant", "content": "Long meetings are draining." } + ], + [ + { "role": "user", "content": "I planted tomatoes and basil on the balcony." }, + { "role": "assistant", "content": "Fresh herbs are the best." } + ] + ], + "answer_session_ids": ["B_s2"] + }, + { + "question_id": "fix_003", + "question_type": "single-session-user", + "question": "What is my dog's name?", + "answer": "Rex", + "question_date": "2024-05-03", + "haystack_session_ids": ["C_s0", "C_s1", "C_s2", "C_s3", "C_s4"], + "haystack_dates": ["2024-01-08", "2024-01-19", "2024-02-14", "2024-02-28", "2024-03-21"], + "haystack_sessions": [ + [ + { "role": "user", "content": "I finished reading a long fantasy novel finally." }, + { "role": "assistant", "content": "Which series was it?" } + ], + [ + { "role": "user", "content": "My dog Rex loves to play fetch in the park every morning.", "has_answer": true }, + { "role": "assistant", "content": "Rex sounds full of energy." } + ], + [ + { "role": "user", "content": "I switched my coffee order to a flat white." }, + { "role": "assistant", "content": "A good choice." } + ], + [ + { "role": "user", "content": "The car needed an oil change this week." }, + { "role": "assistant", "content": "Routine maintenance helps." } + ], + [ + { "role": "user", "content": "I repainted the spare bedroom a soft green." }, + { "role": "assistant", "content": "Calming color." } + ] + ], + "answer_session_ids": ["C_s1"] + }, + { + "question_id": "fix_004", + "question_type": "multi-session", + "question": "How many siblings do I have?", + "answer": "two", + "question_date": "2024-05-04", + "haystack_session_ids": ["D_s0", "D_s1", "D_s2", "D_s3", "D_s4"], + "haystack_dates": ["2024-01-12", "2024-01-25", "2024-02-09", "2024-03-05", "2024-03-27"], + "haystack_sessions": [ + [ + { "role": "user", "content": "I upgraded my laptop's memory over the weekend." }, + { "role": "assistant", "content": "More RAM always helps." } + ], + [ + { "role": "user", "content": "We tried indoor rock climbing for the first time." }, + { "role": "assistant", "content": "How did it go?" } + ], + [ + { "role": "user", "content": "I grew up with two siblings, an older brother and a younger sister.", "has_answer": true }, + { "role": "assistant", "content": "A middle child then." } + ], + [ + { "role": "user", "content": "The bakery near my office started selling pretzels." }, + { "role": "assistant", "content": "Warm pretzels are great." } + ], + [ + { "role": "user", "content": "I set up a standing desk to fix my posture." }, + { "role": "assistant", "content": "Your back will thank you." } + ] + ], + "answer_session_ids": ["D_s2"] + } +] diff --git a/packages/retrieval/eval/longmemeval/judge.ts b/packages/retrieval/eval/longmemeval/judge.ts new file mode 100644 index 00000000..122e13b7 --- /dev/null +++ b/packages/retrieval/eval/longmemeval/judge.ts @@ -0,0 +1,55 @@ +import type { Judge } from './types'; +import { chat } from './reader'; + +// Judge model. Defaults to gpt-5.5; override with LONGMEMEVAL_JUDGE_MODEL to run +// a like-for-like comparison config (e.g. gpt-4o) without editing the default. +const JUDGE_MODEL = process.env.LONGMEMEVAL_JUDGE_MODEL ?? 'gpt-5.5'; + +function normalize(text: string): string { + return text + .toLowerCase() + .replace(/[^a-z0-9\s]/g, ' ') + .replace(/\s+/g, ' ') + .trim(); +} + +/** + * Mock judge: normalized substring/exact match against gold. Good enough to + * grade the bundled fixture deterministically without a model. + */ +export function createMockJudge(): Judge { + return { + name: 'mock-substring', + grade: async (_question: string, gold: string, predicted: string) => { + const g = normalize(gold); + const p = normalize(predicted); + // An empty prediction (e.g. a retrieval miss leaving the reader no + // context) must never grade correct — `g.includes('')` is always true. + if (g.length === 0 || p.length === 0) return false; + return p === g || p.includes(g) || g.includes(p); + }, + }; +} + +/** Real judge: GPT grader returning correct/incorrect, LongMemEval-style. */ +export function createOpenAIJudge(apiKey: string): Judge { + return { + name: `openai-judge:${JUDGE_MODEL}`, + grade: async (question: string, gold: string, predicted: string) => { + const system = + 'You are an impartial grader for a long-term memory QA benchmark. ' + + 'Given a question, the gold answer, and a model answer, decide whether the ' + + 'model answer is correct (conveys the same key information as the gold answer). ' + + 'Reply with exactly one word: "correct" or "incorrect".'; + const user = `Question: ${question}\nGold answer: ${gold}\nModel answer: ${predicted}\n\nVerdict:`; + const verdict = await chat(apiKey, JUDGE_MODEL, system, user); + const v = verdict.toLowerCase(); + // `\bcorrect\b` does not match inside "incorrect" (no word boundary), so + // detect the verdict by whole word and reject negated/partial forms like + // "incorrect", "not correct", or "partially correct". + const saysCorrect = /\bcorrect\b/.test(v); + const negated = /\bincorrect\b/.test(v) || /\b(?:not|partially|isn't)\b[\s\S]{0,20}\bcorrect\b/.test(v); + return saysCorrect && !negated; + }, + }; +} diff --git a/packages/retrieval/eval/longmemeval/reader.ts b/packages/retrieval/eval/longmemeval/reader.ts new file mode 100644 index 00000000..87660ee5 --- /dev/null +++ b/packages/retrieval/eval/longmemeval/reader.ts @@ -0,0 +1,81 @@ +import type { Reader } from './types'; + +// Reader model. Defaults to gpt-5.4; override with LONGMEMEVAL_READER_MODEL to +// run a like-for-like comparison config (e.g. gpt-4o) without editing the default. +const CHAT_MODEL = process.env.LONGMEMEVAL_READER_MODEL ?? 'gpt-5.4'; + +/** + * GPT-5-tier reasoning models reject a non-default `temperature`; callers must + * omit it for those models and keep deterministic `temperature: 0` elsewhere. + */ +function isGpt5Tier(model: string): boolean { + return /^gpt-5/i.test(model); +} + +/** Mock reader: echo the top retrieved chunk's text as the answer. */ +export function createMockReader(): Reader { + return { + name: 'mock-top-hit', + answer: async (_question: string, contexts: string[]) => contexts[0] ?? '', + }; +} + +async function chat( + apiKey: string, + model: string, + system: string, + user: string, +): Promise { + const body: Record = { + model, + messages: [ + { role: 'system', content: system }, + { role: 'user', content: user }, + ], + }; + // Deterministic grading where the model allows it; GPT-5-tier models reject a + // non-default temperature, so omit the field and let the API default stand. + if (!isGpt5Tier(model)) { + body.temperature = 0; + } + const res = await fetch('https://api.openai.com/v1/chat/completions', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${apiKey}`, + }, + body: JSON.stringify(body), + }); + if (!res.ok) { + const body = await res.text(); + throw new Error(`OpenAI chat failed (${res.status}): ${body.slice(0, 300)}`); + } + const json = (await res.json()) as { choices: { message: { content: string } }[] }; + return json.choices[0].message.content.trim(); +} + +/** Real reader: answer the question from retrieved context. */ +export function createOpenAIReader(apiKey: string): Reader { + return { + name: `openai:${CHAT_MODEL}`, + answer: async (question: string, contexts: string[]) => { + const system = + 'You answer questions about a user from the provided conversation excerpts. ' + + 'Answer concisely. ' + + 'For factual questions, give the answer stated in the excerpts; if the excerpts do not ' + + 'contain it, say "I don\'t know". ' + + 'For questions asking for a recommendation, suggestion, advice, or tips, infer and give a ' + + 'recommendation grounded in what the excerpts reveal about this user\'s preferences, ' + + 'context, and history. Base the recommendation only on preferences and signals actually ' + + 'present in the excerpts — do not invent preferences or recommend from general knowledge ' + + 'unsupported by the excerpts. If the excerpts reveal nothing relevant to the request, ' + + 'say "I don\'t know".'; + const user = `Conversation excerpts:\n${contexts + .map((c, i) => `[${i + 1}] ${c}`) + .join('\n\n')}\n\nQuestion: ${question}\nAnswer:`; + return chat(apiKey, CHAT_MODEL, system, user); + }, + }; +} + +export { chat }; diff --git a/packages/retrieval/eval/longmemeval/run.ts b/packages/retrieval/eval/longmemeval/run.ts new file mode 100644 index 00000000..0003ff4a --- /dev/null +++ b/packages/retrieval/eval/longmemeval/run.ts @@ -0,0 +1,83 @@ +import { fileURLToPath } from 'node:url'; +import { dirname, join } from 'node:path'; +import { createMockEmbedder, createOpenAIEmbedder } from './embed'; +import { createMockStore, createRealStore } from './store'; +import { createMockReader, createOpenAIReader } from './reader'; +import { createMockJudge, createOpenAIJudge } from './judge'; +import { loadDataset } from './dataset'; +import { runEval, formatSummary } from './runner'; +import type { ChunkMode, Embedder, Judge, Reader, Store } from './types'; + +function envInt(name: string, fallback: number): number { + const raw = process.env[name]; + if (raw === undefined || raw === '') return fallback; + const value = parseInt(raw, 10); + return Number.isFinite(value) && value > 0 ? value : fallback; +} + +async function main(): Promise { + const apiKey = process.env.OPENAI_API_KEY; + const valkeyUrl = process.env.VALKEY_URL ?? 'redis://:devpassword@localhost:6384'; + const dataPath = process.env.LONGMEMEVAL_DATA; + const limit = envInt('LONGMEMEVAL_LIMIT', 20); + const k = envInt('LONGMEMEVAL_K', 10); + const chunkMode: ChunkMode = process.env.LONGMEMEVAL_CHUNK === 'turn' ? 'turn' : 'session'; + const qa = process.env.LONGMEMEVAL_QA === '1'; + + const cachePath = join(dirname(fileURLToPath(import.meta.url)), '.cache', 'embeddings.json'); + + // EMBEDDER seam. + let embedder: Embedder; + if (apiKey !== undefined && apiKey !== '') { + embedder = await createOpenAIEmbedder(apiKey, cachePath); + } else { + embedder = createMockEmbedder(); + } + + // STORE seam. + let store: Store | null = await createRealStore(valkeyUrl); + if (store === null) { + store = createMockStore(); + } + + // READER + JUDGE seams (Tier 2 only). + let reader: Reader | null = null; + let judge: Judge | null = null; + if (qa) { + if (apiKey !== undefined && apiKey !== '') { + reader = createOpenAIReader(apiKey); + judge = createOpenAIJudge(apiKey); + } else { + reader = createMockReader(); + judge = createMockJudge(); + } + } + + const { records, source } = await loadDataset(dataPath); + + const tier = qa ? 'Tier 2 (retrieval + QA)' : store.isReal || embedder.dims === 1536 ? 'Tier 1 (real recall)' : 'Tier 0 (offline)'; + + console.log('='.repeat(64)); + console.log('LongMemEval retrieval harness — @betterdb/retrieval'); + console.log('='.repeat(64)); + console.log(`tier : ${tier}`); + console.log(`embedder : ${embedder.name} (dims=${embedder.dims})`); + console.log(`store : ${store.name}${store.isReal ? '' : ' (Valkey unreachable → mock)'}`); + console.log(`reader : ${reader === null ? 'disabled' : reader.name}`); + console.log(`judge : ${judge === null ? 'disabled' : judge.name}`); + console.log(`dataset : ${source} (${records.length} records)`); + console.log(`params : limit=${limit} k=${k} chunk=${chunkMode} qa=${qa}`); + console.log('='.repeat(64)); + + try { + const summary = await runEval({ records, embedder, store, reader, judge, k, chunkMode, limit }); + console.log(formatSummary(summary)); + } finally { + await store.close(); + } +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/packages/retrieval/eval/longmemeval/runner.ts b/packages/retrieval/eval/longmemeval/runner.ts new file mode 100644 index 00000000..ad5a0f90 --- /dev/null +++ b/packages/retrieval/eval/longmemeval/runner.ts @@ -0,0 +1,191 @@ +import { Retriever } from '../../src/index'; +import type { RetrievalSchema } from '../../src/index'; +import { chunkRecord, recordIsHit } from './adapter'; +import type { ChunkMode, Embedder, Judge, LmeRecord, Reader, Store } from './types'; + +export interface RunConfig { + records: LmeRecord[]; + embedder: Embedder; + store: Store; + reader: Reader | null; + judge: Judge | null; + k: number; + chunkMode: ChunkMode; + limit: number; +} + +export interface TypeStats { + type: string; + total: number; + recallHits: number; + qaCorrect: number; +} + +export interface EvalSummary { + total: number; + recallHits: number; + recallAtK: number; + qaRun: boolean; + qaCorrect: number; + qaAccuracy: number; + k: number; + totalChunks: number; + byType: Map; +} + +const sleep = (ms: number): Promise => new Promise((resolve) => setTimeout(resolve, ms)); + +async function pollUntil(predicate: () => Promise, attempts = 40): Promise { + for (let i = 0; i < attempts; i++) { + if (await predicate()) return true; + await sleep(100); + } + return false; +} + +function buildSchema(dims: number): RetrievalSchema { + return { + fields: { + session_id: { type: 'tag' }, + date: { type: 'tag' }, + }, + vector: { metric: 'cosine', algorithm: 'hnsw', dims }, + }; +} + +function bump(byType: Map, type: string): TypeStats { + let stats = byType.get(type); + if (stats === undefined) { + stats = { type, total: 0, recallHits: 0, qaCorrect: 0 }; + byType.set(type, stats); + } + return stats; +} + +export async function runEval(config: RunConfig): Promise { + const { records, embedder, store, reader, judge, k, chunkMode, limit } = config; + const qaRun = reader !== null && judge !== null; + const schema = buildSchema(embedder.dims); + const byType = new Map(); + + let total = 0; + let recallHits = 0; + let qaCorrect = 0; + let totalChunks = 0; + + const slice = records.slice(0, limit); + // Flush in `finally` so a mid-run failure (embedding/Valkey/reader error) + // still persists the embeddings already computed — billable work must not be + // discarded just because the run didn't reach the end. + try { + for (let i = 0; i < slice.length; i++) { + const record = slice[i]; + const name = `lme_${i}_${Math.random().toString(36).slice(2, 8)}`; + const retriever = new Retriever({ + client: store.client, + name, + schema, + embedFn: embedder.embed, + }); + + const chunks = chunkRecord(record, chunkMode); + totalChunks += chunks.length; + + await retriever.createIndex(); + await retriever.upsert(chunks); + + if (store.isReal) { + // A hit-count check can pass while HNSW is still backfilling. Wait for the + // index to report every chunk ingested and fully indexed so recall is not + // measured on an incomplete graph. + const settled = await pollUntil(async () => { + const h = await retriever.health(); + // percentIndexed is normalized to a 0-100 scale; require full coverage. + return h.numDocs >= chunks.length && h.percentIndexed >= 100; + }); + if (!settled) { + console.warn( + `index ${name} did not settle within the poll window (record ${i + 1}); recall may be undercounted`, + ); + } + } + + const hits = await retriever.query({ text: record.question, k }); + const hit = recordIsHit(hits, record.answer_session_ids); + + const stats = bump(byType, record.question_type); + stats.total++; + total++; + if (hit) { + stats.recallHits++; + recallHits++; + } + + if (qaRun && reader !== null && judge !== null) { + // Temporal-reasoning questions need the session date (stored on the chunk's + // `date` tag) and the question's asked-on date in the prompt; passing only + // hit.text strips both and depresses temporal QA. Prefix each excerpt with + // its date and carry question_date into the question the reader sees. + const contexts = hits.map((h) => (h.fields.date ? `[${h.fields.date}] ${h.text}` : h.text)); + const question = + record.question_date !== undefined && record.question_date !== '' + ? `${record.question} (question asked on ${record.question_date})` + : record.question; + const answer = await reader.answer(question, contexts); + // Grade against the same date-anchored question the reader saw, so the + // judge has the temporal anchor too and doesn't mismark temporal items. + const correct = await judge.grade(question, record.answer, answer); + if (correct) { + stats.qaCorrect++; + qaCorrect++; + } + } + + await retriever.delete(chunks.map((c) => c.id)).catch(() => {}); + await retriever.dropIndex().catch(() => {}); + } + } finally { + await embedder.flush?.(); + } + + return { + total, + recallHits, + recallAtK: total > 0 ? recallHits / total : 0, + qaRun, + qaCorrect, + qaAccuracy: qaRun && total > 0 ? qaCorrect / total : 0, + k, + totalChunks, + byType, + }; +} + +export function formatSummary(summary: EvalSummary): string { + const lines: string[] = []; + const pct = (n: number): string => `${(n * 100).toFixed(1)}%`; + lines.push(''); + lines.push(`Records: ${summary.total} Chunks indexed: ${summary.totalChunks} k=${summary.k}`); + lines.push(''); + + const header = summary.qaRun + ? 'question_type n recall@k QA-acc' + : 'question_type n recall@k'; + lines.push(header); + lines.push('-'.repeat(header.length)); + + const rows = Array.from(summary.byType.values()).sort((a, b) => a.type.localeCompare(b.type)); + for (const row of rows) { + const recall = pct(row.total > 0 ? row.recallHits / row.total : 0); + const base = `${row.type.padEnd(36)} ${String(row.total).padStart(3)} ${recall.padStart(8)}`; + lines.push(summary.qaRun ? `${base} ${pct(row.qaCorrect / row.total).padStart(6)}` : base); + } + + lines.push('-'.repeat(header.length)); + const overall = `${'OVERALL'.padEnd(36)} ${String(summary.total).padStart(3)} ${pct( + summary.recallAtK, + ).padStart(8)}`; + lines.push(summary.qaRun ? `${overall} ${pct(summary.qaAccuracy).padStart(6)}` : overall); + lines.push(''); + return lines.join('\n'); +} diff --git a/packages/retrieval/eval/longmemeval/store.ts b/packages/retrieval/eval/longmemeval/store.ts new file mode 100644 index 00000000..21cee4a2 --- /dev/null +++ b/packages/retrieval/eval/longmemeval/store.ts @@ -0,0 +1,311 @@ +import type { RetrieverClient } from '../../src/index'; +import type { Store } from './types'; + +type FieldType = 'tag' | 'numeric' | 'text' | 'vector'; + +interface IndexConfig { + name: string; + prefix: string; + vectorField: string; + dims: number; + fieldTypes: Record; +} + +function decodeFloat32(value: unknown): number[] { + if (!Buffer.isBuffer(value)) { + return []; + } + const out: number[] = []; + for (let i = 0; i + 4 <= value.length; i += 4) { + out.push(value.readFloatLE(i)); + } + return out; +} + +function cosineDistance(a: number[], b: number[]): number { + let dot = 0; + let na = 0; + let nb = 0; + const len = Math.min(a.length, b.length); + for (let i = 0; i < len; i++) { + dot += a[i] * b[i]; + na += a[i] * a[i]; + nb += b[i] * b[i]; + } + const denom = Math.sqrt(na) * Math.sqrt(nb) || 1; + return 1 - dot / denom; +} + +function unescapeTag(value: string): string { + return value.replace(/\\(.)/g, '$1'); +} + +interface ParsedQuery { + k: number; + scoreField: string; + tagFilters: { field: string; value: string }[]; + numericFilters: { field: string; value: number }[]; +} + +function parseQuery(queryString: string): ParsedQuery { + const knn = /\[KNN (\d+) @(\S+) \$vec AS (\S+)\]/.exec(queryString); + if (knn === null) { + throw new Error(`Mock FT.SEARCH could not parse KNN clause: ${queryString}`); + } + const k = parseInt(knn[1], 10); + const scoreField = knn[3]; + + const filterExpr = queryString.slice(0, queryString.indexOf('=>')); + const tagFilters: { field: string; value: string }[] = []; + const numericFilters: { field: string; value: number }[] = []; + + if (filterExpr.trim() !== '*') { + const tagRe = /@(\w+):\{([^}]*)\}/g; + let m: RegExpExecArray | null; + while ((m = tagRe.exec(filterExpr)) !== null) { + tagFilters.push({ field: m[1], value: unescapeTag(m[2]) }); + } + const numRe = /@(\w+):\[(\S+) (\S+)\]/g; + while ((m = numRe.exec(filterExpr)) !== null) { + numericFilters.push({ field: m[1], value: parseFloat(m[2]) }); + } + } + + return { k, scoreField, tagFilters, numericFilters }; +} + +/** + * In-memory store that implements the subset of the valkey-search command + * surface the Retriever uses. FT.SEARCH parses the query string the SDK builds, + * decodes the Float32 PARAMS vector, computes cosine distance to every stored + * vector, applies tag/numeric filters and returns the top-k in the exact reply + * shape parseFtSearchResponse expects. Behaves like FLAT-exact (≈ HNSW on small + * data), and is fully synchronous so Tier 0 is deterministic. + */ +class MockRetrieverClient implements RetrieverClient { + private readonly indexes = new Map(); + private readonly hashes = new Map>(); + + async call(command: string, ...args: (string | Buffer | number)[]): Promise { + switch (command.toUpperCase()) { + case 'HSET': + return this.hset(args); + case 'DEL': + return this.del(args); + case 'HDEL': + return this.hdel(args); + case 'FT.CREATE': + return this.ftCreate(args); + case 'FT.INFO': + return this.ftInfo(args); + case 'FT.DROPINDEX': + return this.ftDropIndex(args); + case 'FT._LIST': + return Array.from(this.indexes.keys()); + case 'FT.SEARCH': + return this.ftSearch(args); + default: + throw new Error(`MockRetrieverClient: unsupported command ${command}`); + } + } + + private hset(args: (string | Buffer | number)[]): number { + const key = String(args[0]); + let map = this.hashes.get(key); + if (map === undefined) { + map = new Map(); + this.hashes.set(key, map); + } + for (let i = 1; i + 1 < args.length; i += 2) { + const field = String(args[i]); + const value = args[i + 1]; + map.set(field, Buffer.isBuffer(value) ? value : String(value)); + } + return 1; + } + + private del(args: (string | Buffer | number)[]): number { + let removed = 0; + for (const arg of args) { + if (this.hashes.delete(String(arg))) { + removed++; + } + } + return removed; + } + + private hdel(args: (string | Buffer | number)[]): number { + const key = String(args[0]); + const map = this.hashes.get(key); + if (map === undefined) return 0; + let removed = 0; + for (let i = 1; i < args.length; i++) { + if (map.delete(String(args[i]))) removed++; + } + return removed; + } + + private ftCreate(args: (string | Buffer | number)[]): string { + const tokens = args.map(String); + const name = tokens[0]; + const prefixIdx = tokens.indexOf('PREFIX'); + const prefix = tokens[prefixIdx + 2]; + const schemaIdx = tokens.indexOf('SCHEMA'); + const fieldTypes: Record = {}; + let vectorField = 'embedding'; + let dims = 0; + + let i = schemaIdx + 1; + while (i < tokens.length) { + const fieldName = tokens[i]; + const type = tokens[i + 1]; + i += 2; + if (type === 'TAG') { + fieldTypes[fieldName] = 'tag'; + if (tokens[i] === 'SEPARATOR') i += 2; + } else if (type === 'NUMERIC') { + fieldTypes[fieldName] = 'numeric'; + if (tokens[i] === 'SORTABLE') i += 1; + } else if (type === 'TEXT') { + fieldTypes[fieldName] = 'text'; + } else if (type === 'VECTOR') { + fieldTypes[fieldName] = 'vector'; + vectorField = fieldName; + const count = parseInt(tokens[i + 1], 10); + const pairStart = i + 2; + for (let j = pairStart; j < pairStart + count; j += 2) { + if (tokens[j] === 'DIM') { + dims = parseInt(tokens[j + 1], 10); + } + } + i = pairStart + count; + } else { + // Unknown token; skip defensively. + i += 1; + } + } + + this.indexes.set(name, { name, prefix, vectorField, dims, fieldTypes }); + return 'OK'; + } + + private requireIndex(name: string): IndexConfig { + const cfg = this.indexes.get(name); + if (cfg === undefined) { + throw new Error(`Unknown index name: ${name}`); + } + return cfg; + } + + private docsForIndex(cfg: IndexConfig): [string, Map][] { + return Array.from(this.hashes.entries()).filter(([key]) => key.startsWith(cfg.prefix)); + } + + private ftInfo(args: (string | Buffer | number)[]): unknown[] { + const name = String(args[0]); + const cfg = this.requireIndex(name); + const numDocs = this.docsForIndex(cfg).length; + return [ + 'index_name', + name, + 'num_docs', + String(numDocs), + 'indexing', + '0', + 'percent_indexed', + '1', + 'attributes', + [['identifier', cfg.vectorField, 'type', 'VECTOR', 'dim', String(cfg.dims)]], + ]; + } + + private ftDropIndex(args: (string | Buffer | number)[]): string { + const name = String(args[0]); + this.requireIndex(name); + this.indexes.delete(name); + return 'OK'; + } + + private ftSearch(args: (string | Buffer | number)[]): unknown[] { + const name = String(args[0]); + const cfg = this.requireIndex(name); + const queryString = String(args[1]); + const parsed = parseQuery(queryString); + + const vecIdx = args.findIndex((a) => a === 'vec'); + const queryVec = decodeFloat32(args[vecIdx + 1]); + + const scored: { key: string; map: Map; distance: number }[] = []; + for (const [key, map] of this.docsForIndex(cfg)) { + let pass = true; + for (const f of parsed.tagFilters) { + if (String(map.get(f.field) ?? '') !== f.value) { + pass = false; + break; + } + } + if (pass) { + for (const f of parsed.numericFilters) { + if (Number(map.get(f.field)) !== f.value) { + pass = false; + break; + } + } + } + if (!pass) continue; + const docVec = decodeFloat32(map.get(cfg.vectorField)); + scored.push({ key, map, distance: cosineDistance(queryVec, docVec) }); + } + + scored.sort((a, b) => a.distance - b.distance); + const top = scored.slice(0, parsed.k); + + const reply: unknown[] = [top.length]; + for (const hit of top) { + const fieldArray: string[] = []; + for (const [field, value] of hit.map.entries()) { + if (field === cfg.vectorField) continue; + fieldArray.push(field, value.toString()); + } + fieldArray.push(parsed.scoreField, String(hit.distance)); + reply.push(hit.key, fieldArray); + } + return reply; + } +} + +export function createMockStore(): Store { + return { + name: 'mock-in-memory', + isReal: false, + client: new MockRetrieverClient(), + close: async () => {}, + }; +} + +/** + * Real valkey-search store via iovalkey. Returns null if the server is + * unreachable or the search module (FT._LIST) is absent, so callers can fall + * back to the mock store gracefully (same skip-guard as Phase 6). + */ +export async function createRealStore(url: string): Promise { + const { default: Valkey } = await import('iovalkey'); + const client = new Valkey(url, { lazyConnect: true, retryStrategy: () => null }); + client.on('error', () => {}); + try { + await client.connect(); + await client.ping(); + await client.call('FT._LIST'); + } catch { + await client.quit().catch(() => {}); + return null; + } + return { + name: `valkey-search@${url.replace(/:[^:@/]*@/, ':***@')}`, + isReal: true, + client: client as unknown as Store['client'], + close: async () => { + await client.quit().catch(() => {}); + }, + }; +} diff --git a/packages/retrieval/eval/longmemeval/types.ts b/packages/retrieval/eval/longmemeval/types.ts new file mode 100644 index 00000000..a24c548b --- /dev/null +++ b/packages/retrieval/eval/longmemeval/types.ts @@ -0,0 +1,64 @@ +import type { RetrieverClient } from '../../src/index'; + +/** A single turn in a LongMemEval session. */ +export interface LmeTurn { + role: 'user' | 'assistant'; + content: string; + has_answer?: boolean; +} + +/** One session is an ordered list of turns. */ +export type LmeSession = LmeTurn[]; + +/** + * LongMemEval record shape (real dataset + bundled fixture). + * See https://github.com/xiaowu0162/LongMemEval + */ +export interface LmeRecord { + question_id: string; + question_type: string; + question: string; + answer: string; + question_date?: string; + haystack_session_ids: string[]; + haystack_dates?: string[]; + haystack_sessions: LmeSession[]; + answer_session_ids: string[]; +} + +export type ChunkMode = 'session' | 'turn'; + +/** + * EMBEDDER seam. `dims` MUST equal the length the `embed` function returns so + * the schema's vector.dims matches (OpenAI text-embedding-3-small = 1536). + */ +export interface Embedder { + name: string; + dims: number; + embed: (text: string) => Promise; + /** Persist any cache to disk (no-op for the mock). */ + flush?: () => Promise; +} + +/** + * STORE seam. `client` is handed to the Retriever. `isReal` drives async + * index-settling polling; mock stores are synchronous and exact. + */ +export interface Store { + name: string; + isReal: boolean; + client: RetrieverClient; + close: () => Promise; +} + +/** READER seam (Tier 2): generate an answer from retrieved context. */ +export interface Reader { + name: string; + answer: (question: string, contexts: string[]) => Promise; +} + +/** JUDGE seam (Tier 2): grade a generated answer against gold. */ +export interface Judge { + name: string; + grade: (question: string, gold: string, predicted: string) => Promise; +} diff --git a/packages/retrieval/package.json b/packages/retrieval/package.json index 05af4cd9..ff0a687c 100644 --- a/packages/retrieval/package.json +++ b/packages/retrieval/package.json @@ -32,6 +32,7 @@ "typecheck": "tsc --noEmit", "test": "vitest run", "test:watch": "vitest", + "eval:longmemeval": "tsx eval/longmemeval/run.ts", "clean": "rm -rf dist" }, "dependencies": { @@ -41,6 +42,7 @@ "devDependencies": { "@types/node": "^22.19.15", "iovalkey": "^0.3.3", + "tsx": "^4.19.2", "typescript": "^5.9.3", "vitest": "^4.1.1" }, diff --git a/packages/retrieval/src/__tests__/longmemeval.test.ts b/packages/retrieval/src/__tests__/longmemeval.test.ts new file mode 100644 index 00000000..888792ac --- /dev/null +++ b/packages/retrieval/src/__tests__/longmemeval.test.ts @@ -0,0 +1,83 @@ +import { describe, it, expect } from 'vitest'; +import { createMockEmbedder } from '../../eval/longmemeval/embed'; +import { createMockStore } from '../../eval/longmemeval/store'; +import { createMockReader } from '../../eval/longmemeval/reader'; +import { createMockJudge } from '../../eval/longmemeval/judge'; +import { loadFixture } from '../../eval/longmemeval/dataset'; +import { runEval } from '../../eval/longmemeval/runner'; + +// Tier 0: fully offline (mock store + hashed embed), no keys/network/Docker. +describe('longmemeval Tier 0 smoke', () => { + it('retrieves the evidence session above threshold on the fixture', async () => { + const records = await loadFixture(); + const summary = await runEval({ + records, + embedder: createMockEmbedder(), + store: createMockStore(), + reader: null, + judge: null, + k: 2, + chunkMode: 'session', + limit: 20, + }); + + expect(summary.total).toBe(records.length); + // Lexical mock embedding must rank the evidence session within the top-k. + expect(summary.recallAtK).toBeGreaterThanOrEqual(0.75); + }); + + it('is deterministic across runs', async () => { + const records = await loadFixture(); + const run = (): ReturnType => + runEval({ + records, + embedder: createMockEmbedder(), + store: createMockStore(), + reader: null, + judge: null, + k: 2, + chunkMode: 'session', + limit: 20, + }); + + const a = await run(); + const b = await run(); + expect(a.recallHits).toBe(b.recallHits); + expect(a.recallAtK).toBe(b.recallAtK); + }); + + it('runs the mock reader+judge QA path end to end', async () => { + const records = await loadFixture(); + const summary = await runEval({ + records, + embedder: createMockEmbedder(), + store: createMockStore(), + reader: createMockReader(), + judge: createMockJudge(), + k: 2, + chunkMode: 'session', + limit: 20, + }); + + expect(summary.qaRun).toBe(true); + // Mock reader echoes the top hit; the evidence text contains the gold answer. + expect(summary.qaAccuracy).toBeGreaterThanOrEqual(0.75); + }); + + it('supports per-turn chunking', async () => { + const records = await loadFixture(); + const summary = await runEval({ + records, + embedder: createMockEmbedder(), + store: createMockStore(), + reader: null, + judge: null, + k: 3, + chunkMode: 'turn', + limit: 20, + }); + + expect(summary.total).toBe(records.length); + expect(summary.totalChunks).toBeGreaterThan(records.length); + }); +}); diff --git a/packages/retrieval/tsconfig.json b/packages/retrieval/tsconfig.json index 78f3e3fd..7a9258f9 100644 --- a/packages/retrieval/tsconfig.json +++ b/packages/retrieval/tsconfig.json @@ -15,5 +15,5 @@ "types": ["node"] }, "include": ["src/**/*"], - "exclude": ["node_modules", "dist", "src/__tests__"] + "exclude": ["node_modules", "dist", "src/__tests__", "eval"] } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index f0ba7073..8febc5ad 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -559,6 +559,9 @@ importers: iovalkey: specifier: ^0.3.3 version: 0.3.3 + tsx: + specifier: ^4.19.2 + version: 4.21.0 typescript: specifier: ^5.9.3 version: 5.9.3