-
Notifications
You must be signed in to change notification settings - Fork 74
feat(retrieval): LongMemEval evaluation harness #252
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
KIvanow
wants to merge
9
commits into
feature/retrieval-sdk-phase6-integration
Choose a base branch
from
feature/retrieval-longmemeval-harness
base: feature/retrieval-sdk-phase6-integration
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
9 commits
Select commit
Hold shift + click to select a range
2483192
feat(retrieval): LongMemEval evaluation harness
KIvanow 31ba6c7
fix(retrieval): harden LongMemEval judges and index polling
KIvanow c16e83c
fix(retrieval): gate LongMemEval recall on full index readiness
KIvanow e699e8f
fix(retrieval): correct index-settle scale, judge parsing, and long-s…
KIvanow 0f0da3d
fix(retrieval): cap turn-mode chunks and feed dates to the QA reader
KIvanow 0c10fa3
fix(retrieval): grade QA with the same date-anchored question as the …
KIvanow 1cb3782
fix(retrieval): persist embedding cache even when the eval run fails
KIvanow 62604cb
feat(retrieval): make LongMemEval reader/judge models env-overridable
KIvanow e3a8046
fix(retrieval): let LongMemEval reader infer grounded recommendations
KIvanow File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,97 @@ | ||
| import type { UpsertEntry, QueryHit } from '../../src/index'; | ||
| import type { ChunkMode, LmeRecord, LmeSession } from './types'; | ||
|
|
||
// text-embedding-3-small accepts at most 8191 tokens per input. Cap each chunk
// well under that (~4 chars/token heuristic, with margin) so a long session is
// split into multiple chunks instead of failing the embedding call. Every part
// keeps the session's session_id, so recall (which matches on session_id) is
// unaffected.
// By that same heuristic 24000 chars is roughly 6000 tokens. The cap is
// compared against `string.length`, so it is counted in UTF-16 code units.
const MAX_EMBED_CHARS = 24000;
|
|
||
/**
 * Hard-slice a string into consecutive pieces, each at most `budget` UTF-16
 * code units long. Concatenating the pieces reproduces `text` exactly.
 *
 * A cut is never placed between the two halves of a surrogate pair, so a
 * piece does not start or end with half of an astral character (emoji etc.).
 * The only exception is `budget === 1`, where splitting the pair is the sole
 * way to honor the budget while still making progress.
 *
 * @param text - String to split; an empty string yields no pieces.
 * @param budget - Maximum piece length. Fractional values are floored.
 * @returns The pieces, in order.
 * @throws RangeError when `budget` is NaN or less than 1 (previously this
 *   looped forever or produced empty pieces).
 */
function sliceToBudget(text: string, budget: number): string[] {
  if (Number.isNaN(budget) || budget < 1) {
    throw new RangeError(`sliceToBudget: budget must be a number >= 1, got ${budget}`);
  }
  const size = Math.floor(budget);
  const parts: string[] = [];
  let start = 0;
  while (start < text.length) {
    let end = Math.min(start + size, text.length);
    if (end < text.length && end - start > 1) {
      const lastUnit = text.charCodeAt(end - 1);
      const nextUnit = text.charCodeAt(end);
      const splitsPair =
        lastUnit >= 0xd800 && lastUnit <= 0xdbff && nextUnit >= 0xdc00 && nextUnit <= 0xdfff;
      // Back off one unit so the whole pair moves into the next piece.
      if (splitsPair) end -= 1;
    }
    parts.push(text.slice(start, end));
    start = end;
  }
  return parts;
}
|
|
||
/**
 * Greedily pack a session's turns into chunks of at most `budget` characters.
 *
 * Each turn is rendered as `role: content`. A rendered turn that is longer
 * than `budget` is hard-sliced first, so only its first piece carries the
 * role prefix. Pieces are then appended to the current chunk, separated by a
 * newline, until the next piece would push the chunk over `budget`.
 *
 * @param session - Turns of one session, in order.
 * @param budget - Maximum length of each returned chunk.
 * @returns The packed chunks; empty when the session has no turns.
 */
function packTurns(session: LmeSession, budget: number): string[] {
  // Phase 1: render every turn, splitting the ones that cannot fit on their own.
  const pieces: string[] = [];
  for (const turn of session) {
    const rendered = `${turn.role}: ${turn.content}`;
    const fitting = rendered.length > budget ? sliceToBudget(rendered, budget) : [rendered];
    pieces.push(...fitting);
  }

  // Phase 2: fill chunks greedily, starting a new one when the next piece overflows.
  const packed: string[] = [];
  let buffer = '';
  for (const piece of pieces) {
    if (buffer.length === 0) {
      buffer = piece;
      continue;
    }
    // +1 accounts for the newline that would join the piece to the buffer.
    const fits = buffer.length + 1 + piece.length <= budget;
    if (fits) {
      buffer = `${buffer}\n${piece}`;
    } else {
      packed.push(buffer);
      buffer = piece;
    }
  }
  if (buffer.length > 0) packed.push(buffer);
  return packed;
}
|
|
||
/**
 * Convert one LongMemEval record's haystack into `UpsertEntry` chunks.
 *
 * - `'turn'` mode: one entry per turn, id `s<session>_t<turn>`.
 * - any other mode (the `'session'` default): the session's turns are packed
 *   together, id `s<session>`.
 *
 * In both modes text longer than the embedder budget is split into several
 * entries, and only then do the ids gain a `_p<part>` suffix. Every entry
 * carries a `session_id` field (falling back to `session_<index>` when the
 * record has no id at that position) and a `date` field when the record has a
 * non-empty date for the session, so recall can match evidence by session.
 *
 * @param record - The LongMemEval record whose haystack is chunked.
 * @param mode - Chunking granularity.
 * @returns Entries in session order, then turn/part order.
 */
export function chunkRecord(record: LmeRecord, mode: ChunkMode): UpsertEntry[] {
  return record.haystack_sessions.flatMap((session, sessionIndex) => {
    const tags: Record<string, string> = {
      session_id: record.haystack_session_ids[sessionIndex] ?? `session_${sessionIndex}`,
    };
    const when = record.haystack_dates?.[sessionIndex];
    if (when !== undefined && when !== '') {
      tags.date = when;
    }

    // Builds the entries for one id stem; the part suffix appears only on splits.
    const toEntries = (stem: string, texts: string[]): UpsertEntry[] =>
      texts.map((text, partIndex) => ({
        id: texts.length === 1 ? stem : `${stem}_p${partIndex}`,
        text,
        // Each entry gets its own copy so later mutation cannot leak across entries.
        fields: { ...tags },
      }));

    const sessionStem = `s${sessionIndex}`;
    if (mode !== 'turn') {
      return toEntries(sessionStem, packTurns(session, MAX_EMBED_CHARS));
    }

    return session.flatMap((turn, turnIndex) => {
      const rendered = `${turn.role}: ${turn.content}`;
      // A single turn can exceed the embedder budget too; hard-slice it so it
      // still embeds instead of failing the chunk.
      const texts =
        rendered.length > MAX_EMBED_CHARS ? sliceToBudget(rendered, MAX_EMBED_CHARS) : [rendered];
      return toEntries(`${sessionStem}_t${turnIndex}`, texts);
    });
  });
}
|
|
||
/**
 * Decide whether a retrieval counts as a recall hit.
 *
 * @param hits - Retrieved chunks; each is matched by its `fields.session_id`.
 * @param answerSessionIds - Session ids that contain the evidence.
 * @returns `true` as soon as one retrieved chunk belongs to an evidence session.
 */
export function recordIsHit(hits: QueryHit[], answerSessionIds: string[]): boolean {
  const wanted = new Set(answerSessionIds);
  for (const { fields } of hits) {
    if (wanted.has(fields.session_id)) {
      return true;
    }
  }
  return false;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| import { readFile } from 'node:fs/promises'; | ||
| import { fileURLToPath } from 'node:url'; | ||
| import { dirname, join } from 'node:path'; | ||
| import type { LmeRecord } from './types'; | ||
|
|
||
/** Absolute path of `fixture.json`, located in the same directory as this module. */
function fixturePath(): string {
  const moduleDir = dirname(fileURLToPath(import.meta.url));
  return join(moduleDir, 'fixture.json');
}
|
|
||
/**
 * Read and parse the bundled LongMemEval-shaped fixture (offline,
 * deterministic).
 *
 * @returns The fixture's records; the parsed JSON is trusted, not validated.
 */
export async function loadFixture(): Promise<LmeRecord[]> {
  const text = await readFile(fixturePath(), 'utf8');
  const records = JSON.parse(text) as LmeRecord[];
  return records;
}
|
|
||
/**
 * Load the dataset: the real LongMemEval json at `dataPath` when given, else
 * the bundled fixture.
 *
 * A user-supplied file is checked to be a JSON array before it is returned,
 * so pointing the harness at the wrong file fails here with the path in the
 * message instead of somewhere deep in chunking. Individual records are not
 * validated.
 *
 * @param dataPath - Path to a LongMemEval json file; `undefined` or `''`
 *   selects the bundled fixture.
 * @returns The records plus a human-readable source label (the path, or
 *   `'bundled fixture'`).
 * @throws TypeError when the file parses but its top level is not an array.
 *   Read and JSON syntax errors propagate unchanged.
 */
export async function loadDataset(
  dataPath: string | undefined,
): Promise<{ records: LmeRecord[]; source: string }> {
  if (dataPath === undefined || dataPath === '') {
    return { records: await loadFixture(), source: 'bundled fixture' };
  }

  const parsed: unknown = JSON.parse(await readFile(dataPath, 'utf8'));
  if (!Array.isArray(parsed)) {
    const found = parsed === null ? 'null' : typeof parsed;
    throw new TypeError(
      `LongMemEval dataset at ${dataPath} must be a JSON array of records, got ${found}`,
    );
  }
  return { records: parsed as LmeRecord[], source: dataPath };
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,120 @@ | ||
| import { createHash } from 'node:crypto'; | ||
| import { mkdir, readFile, writeFile } from 'node:fs/promises'; | ||
| import { dirname } from 'node:path'; | ||
| import type { Embedder } from './types'; | ||
|
|
||
// Default vector width of the mock embedder (the default for createMockEmbedder's `dim`).
const MOCK_DIM = 256;
// Model id sent in the OpenAI embeddings request; also mixed into each cache key.
const OPENAI_MODEL = 'text-embedding-3-small';
// Vector width the OpenAI embedder reports as `dims`.
const OPENAI_DIM = 1536;
|
|
||
/**
 * Split text into lowercase tokens made of ASCII letters and digits.
 * Everything else acts as a separator and is dropped.
 */
function tokenize(text: string): string[] {
  const tokens: string[] = [];
  for (const piece of text.toLowerCase().split(/[^a-z0-9]+/)) {
    // Leading/trailing separators leave empty strings behind; skip them.
    if (piece !== '') {
      tokens.push(piece);
    }
  }
  return tokens;
}
|
|
||
/**
 * Scale a vector to unit Euclidean length.
 * A zero-length vector is divided by 1 instead, so it comes back unchanged
 * rather than as NaNs.
 */
function l2normalize(vec: number[]): number[] {
  let sumOfSquares = 0;
  for (const component of vec) {
    sumOfSquares += component * component;
  }
  // `||` swaps a zero norm for 1 to avoid dividing by zero.
  const divisor = Math.sqrt(sumOfSquares) || 1;
  return vec.map((component) => component / divisor);
}
|
|
||
/**
 * Build a deterministic, offline embedder based on a hashed bag of words.
 *
 * Every token is SHA-256 hashed and contributes +1 or -1 to four slots chosen
 * from the digest; the result is L2-normalized. Texts that share tokens end
 * up with higher cosine similarity, which is enough to exercise ranking but
 * is not a semantic score. No network, no keys.
 *
 * @param dim - Width of the produced vectors.
 * @returns An embedder whose `name` records the chosen width.
 */
export function createMockEmbedder(dim = MOCK_DIM): Embedder {
  const embed = async (text: string): Promise<number[]> => {
    const buckets = new Array<number>(dim).fill(0);
    tokenize(text).forEach((token) => {
      const digest = createHash('sha256').update(token).digest();
      // Four 32-bit words of the digest pick four slots for this token.
      for (const offset of [0, 4, 8, 12]) {
        const slot = digest.readUInt32LE(offset) % dim;
        // The low bit of the word's last byte decides the sign.
        const positive = (digest.readUInt8(offset + 3) & 1) === 0;
        buckets[slot] += positive ? 1 : -1;
      }
    });
    return l2normalize(buckets);
  };

  return {
    name: `mock-hashed-bow(dim=${dim})`,
    dims: dim,
    embed,
  };
}
|
|
||
/** Key/value store for embedding vectors that can be persisted on demand. */
interface EmbedCache {
  /** Return the vector stored under `key`, or `undefined` when there is none. */
  get(key: string): number[] | undefined;
  /** Store `vec` under `key`. */
  set(key: string, vec: number[]): void;
  /** Persist the stored vectors; the promise settles once the attempt is done. */
  flush(): Promise<void>;
}
|
|
||
/**
 * Open the on-disk embedding cache at `path`.
 *
 * The file is a JSON object mapping cache keys to vectors. A missing file is
 * the normal first-run case and silently yields an empty cache. Any other
 * problem (unreadable file, invalid JSON, a top level that is not an object)
 * also yields an empty cache, but is reported with a warning: the next flush
 * will overwrite that file, so the loss should not go unnoticed.
 *
 * @param path - Location of the cache file; parent directories are created
 *   on flush.
 * @returns A cache whose `flush` writes the whole map back only when
 *   something was added, and never throws.
 */
async function loadCache(path: string): Promise<EmbedCache> {
  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err));

  let map = new Map<string, number[]>();
  let dirty = false;
  try {
    const parsed: unknown = JSON.parse(await readFile(path, 'utf8'));
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
      throw new TypeError('expected a JSON object mapping keys to vectors');
    }
    map = new Map(Object.entries(parsed as Record<string, number[]>));
  } catch (err) {
    const code =
      typeof err === 'object' && err !== null ? (err as { code?: unknown }).code : undefined;
    // No cache yet is expected; anything else means an existing cache is being dropped.
    if (code !== 'ENOENT') {
      console.warn(`embedding cache at ${path} ignored, starting empty: ${describe(err)}`);
    }
  }

  return {
    get: (key) => map.get(key),
    set: (key, vec) => {
      map.set(key, vec);
      dirty = true;
    },
    flush: async () => {
      if (!dirty) return;
      try {
        await mkdir(dirname(path), { recursive: true });
        await writeFile(path, JSON.stringify(Object.fromEntries(map)));
        dirty = false;
      } catch (err) {
        // The on-disk cache is only a cost optimization. At large scale the map
        // can exceed V8's max string length when serialized; never let a flush
        // failure discard an otherwise-completed eval — warn and continue.
        console.warn(`embedding cache flush skipped: ${describe(err)}`);
      }
    },
  };
}
|
|
||
/**
 * Real OpenAI text-embedding-3-small (1536 dims) with an on-disk,
 * content-addressed cache so re-runs are cheap and indexing isn't re-billed.
 *
 * The cache key is the SHA-256 of the model id plus the text, so a cached
 * vector is only reused for the same model and the same input. A response is
 * validated before it is cached: it must contain a numeric vector of exactly
 * the advertised width, otherwise the call fails without caching anything.
 *
 * @param apiKey - OpenAI API key, sent as a bearer token.
 * @param cachePath - Location of the on-disk cache file.
 * @returns An embedder whose `flush` persists the cache.
 * @throws Error (from `embed`) on a non-2xx response or a malformed payload.
 */
export async function createOpenAIEmbedder(
  apiKey: string,
  cachePath: string,
): Promise<Embedder> {
  const cache = await loadCache(cachePath);
  return {
    name: `openai:${OPENAI_MODEL}`,
    dims: OPENAI_DIM,
    embed: async (text: string): Promise<number[]> => {
      const key = createHash('sha256').update(`${OPENAI_MODEL}\n${text}`).digest('hex');
      const cached = cache.get(key);
      if (cached !== undefined) return cached;

      const res = await fetch('https://api.openai.com/v1/embeddings', {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          Authorization: `Bearer ${apiKey}`,
        },
        body: JSON.stringify({ model: OPENAI_MODEL, input: text }),
      });
      if (!res.ok) {
        const body = await res.text();
        throw new Error(`OpenAI embeddings failed (${res.status}): ${body.slice(0, 300)}`);
      }

      const payload = (await res.json()) as { data?: { embedding?: unknown }[] } | null;
      const embedding: unknown = payload?.data?.[0]?.embedding;
      // Never cache or return something that is not a vector of the advertised width.
      if (
        !Array.isArray(embedding) ||
        embedding.length !== OPENAI_DIM ||
        !embedding.every((value) => typeof value === 'number')
      ) {
        const found = Array.isArray(embedding)
          ? `an array of length ${embedding.length}`
          : typeof embedding;
        throw new Error(
          `OpenAI embeddings returned an unexpected payload: expected ${OPENAI_DIM} numbers, got ${found}`,
        );
      }
      const vec: number[] = embedding;
      cache.set(key, vec);
      return vec;
    },
    flush: () => cache.flush(),
  };
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,130 @@ | ||
| [ | ||
| { | ||
| "question_id": "fix_001", | ||
| "question_type": "single-session-user", | ||
| "question": "What programming language do I use at work?", | ||
| "answer": "Python", | ||
| "question_date": "2024-05-01", | ||
| "haystack_session_ids": ["A_s0", "A_s1", "A_s2", "A_s3", "A_s4"], | ||
| "haystack_dates": ["2024-01-03", "2024-01-10", "2024-02-02", "2024-02-20", "2024-03-15"], | ||
| "haystack_sessions": [ | ||
| [ | ||
| { "role": "user", "content": "I tried a new ramen place downtown last night." }, | ||
| { "role": "assistant", "content": "How was the broth?" } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "At work I write Python every day for our data pipelines.", "has_answer": true }, | ||
| { "role": "assistant", "content": "Python is a solid choice for data engineering." } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "The weather has been rainy all week here." }, | ||
| { "role": "assistant", "content": "Hopefully it clears up soon." } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "I watched a documentary about deep sea creatures." }, | ||
| { "role": "assistant", "content": "Those are fascinating." } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "My knee felt better after the morning run." }, | ||
| { "role": "assistant", "content": "Glad the recovery is going well." } | ||
| ] | ||
| ], | ||
| "answer_session_ids": ["A_s1"] | ||
| }, | ||
| { | ||
| "question_id": "fix_002", | ||
| "question_type": "single-session-preference", | ||
| "question": "Which city did I book a flight to?", | ||
| "answer": "Tokyo", | ||
| "question_date": "2024-05-02", | ||
| "haystack_session_ids": ["B_s0", "B_s1", "B_s2", "B_s3", "B_s4"], | ||
| "haystack_dates": ["2024-01-05", "2024-01-22", "2024-02-11", "2024-03-01", "2024-03-30"], | ||
| "haystack_sessions": [ | ||
| [ | ||
| { "role": "user", "content": "I reorganized my bookshelf by color this weekend." }, | ||
| { "role": "assistant", "content": "That sounds satisfying." } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "My sourdough starter finally rose properly." }, | ||
| { "role": "assistant", "content": "Nice, persistence pays off." } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "I just booked a flight to Tokyo for next spring.", "has_answer": true }, | ||
| { "role": "assistant", "content": "Exciting, Tokyo in spring is beautiful." } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "The team standup ran long again today." }, | ||
| { "role": "assistant", "content": "Long meetings are draining." } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "I planted tomatoes and basil on the balcony." }, | ||
| { "role": "assistant", "content": "Fresh herbs are the best." } | ||
| ] | ||
| ], | ||
| "answer_session_ids": ["B_s2"] | ||
| }, | ||
| { | ||
| "question_id": "fix_003", | ||
| "question_type": "single-session-user", | ||
| "question": "What is my dog's name?", | ||
| "answer": "Rex", | ||
| "question_date": "2024-05-03", | ||
| "haystack_session_ids": ["C_s0", "C_s1", "C_s2", "C_s3", "C_s4"], | ||
| "haystack_dates": ["2024-01-08", "2024-01-19", "2024-02-14", "2024-02-28", "2024-03-21"], | ||
| "haystack_sessions": [ | ||
| [ | ||
| { "role": "user", "content": "I finished reading a long fantasy novel finally." }, | ||
| { "role": "assistant", "content": "Which series was it?" } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "My dog Rex loves to play fetch in the park every morning.", "has_answer": true }, | ||
| { "role": "assistant", "content": "Rex sounds full of energy." } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "I switched my coffee order to a flat white." }, | ||
| { "role": "assistant", "content": "A good choice." } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "The car needed an oil change this week." }, | ||
| { "role": "assistant", "content": "Routine maintenance helps." } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "I repainted the spare bedroom a soft green." }, | ||
| { "role": "assistant", "content": "Calming color." } | ||
| ] | ||
| ], | ||
| "answer_session_ids": ["C_s1"] | ||
| }, | ||
| { | ||
| "question_id": "fix_004", | ||
| "question_type": "multi-session", | ||
| "question": "How many siblings do I have?", | ||
| "answer": "two", | ||
| "question_date": "2024-05-04", | ||
| "haystack_session_ids": ["D_s0", "D_s1", "D_s2", "D_s3", "D_s4"], | ||
| "haystack_dates": ["2024-01-12", "2024-01-25", "2024-02-09", "2024-03-05", "2024-03-27"], | ||
| "haystack_sessions": [ | ||
| [ | ||
| { "role": "user", "content": "I upgraded my laptop's memory over the weekend." }, | ||
| { "role": "assistant", "content": "More RAM always helps." } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "We tried indoor rock climbing for the first time." }, | ||
| { "role": "assistant", "content": "How did it go?" } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "I grew up with two siblings, an older brother and a younger sister.", "has_answer": true }, | ||
| { "role": "assistant", "content": "A middle child then." } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "The bakery near my office started selling pretzels." }, | ||
| { "role": "assistant", "content": "Warm pretzels are great." } | ||
| ], | ||
| [ | ||
| { "role": "user", "content": "I set up a standing desk to fix my posture." }, | ||
| { "role": "assistant", "content": "Your back will thank you." } | ||
| ] | ||
| ], | ||
| "answer_session_ids": ["D_s2"] | ||
| } | ||
| ] |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.