Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,8 @@ packages/mcp/.mcpregistry_github_token
.gitignore
packages/mcp/.mcpregistry_registry_token
data/license.key

# LongMemEval retrieval harness: embedding cache + downloaded datasets
packages/retrieval/eval/longmemeval/.cache/
packages/retrieval/eval/longmemeval/longmemeval_*.json
packages/retrieval/eval/longmemeval/*_oracle.json
97 changes: 97 additions & 0 deletions packages/retrieval/eval/longmemeval/adapter.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import type { UpsertEntry, QueryHit } from '../../src/index';
import type { ChunkMode, LmeRecord, LmeSession } from './types';

// text-embedding-3-small accepts at most 8191 tokens per input. Cap each chunk
// well under that (~4 chars/token heuristic, with margin) so a long session is
// split into multiple chunks instead of failing the embedding call. Every part
// keeps the session's session_id, so recall (which matches on session_id) is
// unaffected.
const MAX_EMBED_CHARS = 24000;

/** Hard-slice a string into consecutive pieces each at most `budget` chars. */
function sliceToBudget(text: string, budget: number): string[] {
const parts: string[] = [];
for (let i = 0; i < text.length; i += budget) {
parts.push(text.slice(i, i + budget));
}
return parts;
}

/** Pack a session's turns into newline-joined chunks each within `budget`. */
function packTurns(session: LmeSession, budget: number): string[] {
const lines: string[] = [];
for (const turn of session) {
const line = `${turn.role}: ${turn.content}`;
if (line.length <= budget) {
lines.push(line);
} else {
// A single turn larger than the budget is hard-sliced so it still embeds.
lines.push(...sliceToBudget(line, budget));
}
}
const chunks: string[] = [];
let current = '';
for (const line of lines) {
if (current.length > 0 && current.length + 1 + line.length > budget) {
chunks.push(current);
current = line;
} else {
current = current.length === 0 ? line : `${current}\n${line}`;
}
}
if (current.length > 0) chunks.push(current);
return chunks;
}

/**
* Turn a LongMemEval haystack into UpsertEntry chunks.
* - 'session' (default): one chunk per session (turns joined); sessions longer
* than the embedder's input budget are split into multiple chunks that all
* carry the same session_id.
* - 'turn': one chunk per turn.
* The id encodes the session index (+ turn/part index when split); fields carry
* the session_id tag (+ date tag when present) so recall can match evidence.
*/
export function chunkRecord(record: LmeRecord, mode: ChunkMode): UpsertEntry[] {
const entries: UpsertEntry[] = [];
record.haystack_sessions.forEach((session, sIdx) => {
const sessionId = record.haystack_session_ids[sIdx] ?? `session_${sIdx}`;
const date = record.haystack_dates?.[sIdx];
const baseFields: Record<string, string> = { session_id: sessionId };
if (date !== undefined && date !== '') {
baseFields.date = date;
}

if (mode === 'turn') {
session.forEach((turn, tIdx) => {
const text = `${turn.role}: ${turn.content}`;
// A single turn can exceed the embedder budget too; hard-slice it like
// session mode so it still embeds instead of failing the chunk.
const parts = text.length <= MAX_EMBED_CHARS ? [text] : sliceToBudget(text, MAX_EMBED_CHARS);
parts.forEach((part, pIdx) => {
entries.push({
id: parts.length === 1 ? `s${sIdx}_t${tIdx}` : `s${sIdx}_t${tIdx}_p${pIdx}`,
text: part,
fields: { ...baseFields },
});
});
});
Comment thread
cursor[bot] marked this conversation as resolved.
} else {
const parts = packTurns(session, MAX_EMBED_CHARS);
parts.forEach((text, pIdx) => {
entries.push({
id: parts.length === 1 ? `s${sIdx}` : `s${sIdx}_p${pIdx}`,
text,
fields: { ...baseFields },
});
});
Comment thread
cursor[bot] marked this conversation as resolved.
}
});
return entries;
}

/** A record is a recall HIT if any retrieved chunk's session_id is evidence. */
export function recordIsHit(hits: QueryHit[], answerSessionIds: string[]): boolean {
const evidence = new Set(answerSessionIds);
return hits.some((hit) => evidence.has(hit.fields.session_id));
}
28 changes: 28 additions & 0 deletions packages/retrieval/eval/longmemeval/dataset.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import { readFile } from 'node:fs/promises';
import { fileURLToPath } from 'node:url';
import { dirname, join } from 'node:path';
import type { LmeRecord } from './types';

function fixturePath(): string {
return join(dirname(fileURLToPath(import.meta.url)), 'fixture.json');
}

/** Load the bundled LongMemEval-shaped fixture (offline, deterministic). */
export async function loadFixture(): Promise<LmeRecord[]> {
const raw = await readFile(fixturePath(), 'utf8');
return JSON.parse(raw) as LmeRecord[];
}

/**
* Load the dataset: the real LongMemEval json at `dataPath` when given, else
* the bundled fixture. Returns records plus a human-readable source label.
*/
export async function loadDataset(
dataPath: string | undefined,
): Promise<{ records: LmeRecord[]; source: string }> {
if (dataPath !== undefined && dataPath !== '') {
const raw = await readFile(dataPath, 'utf8');
return { records: JSON.parse(raw) as LmeRecord[], source: dataPath };
}
return { records: await loadFixture(), source: 'bundled fixture' };
}
120 changes: 120 additions & 0 deletions packages/retrieval/eval/longmemeval/embed.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import { createHash } from 'node:crypto';
import { mkdir, readFile, writeFile } from 'node:fs/promises';
import { dirname } from 'node:path';
import type { Embedder } from './types';

const MOCK_DIM = 256;
const OPENAI_MODEL = 'text-embedding-3-small';
const OPENAI_DIM = 1536;

function tokenize(text: string): string[] {
return text
.toLowerCase()
.split(/[^a-z0-9]+/)
.filter((t) => t.length > 0);
}

function l2normalize(vec: number[]): number[] {
const norm = Math.sqrt(vec.reduce((sum, v) => sum + v * v, 0)) || 1;
return vec.map((v) => v / norm);
}

/**
* Deterministic hashed bag-of-words embedding. Each token is hashed into a few
* fixed dimensions; lexical overlap raises cosine similarity. Enough to prove
* ranking, not a real semantic score. No network, no keys.
*/
export function createMockEmbedder(dim = MOCK_DIM): Embedder {
return {
name: `mock-hashed-bow(dim=${dim})`,
dims: dim,
embed: async (text: string) => {
const vec = new Array<number>(dim).fill(0);
for (const token of tokenize(text)) {
const h = createHash('sha256').update(token).digest();
// Spread each token across 4 slots with signed weights.
for (let s = 0; s < 4; s++) {
const idx = h.readUInt32LE(s * 4) % dim;
const sign = (h[s * 4 + 3] & 1) === 0 ? 1 : -1;
vec[idx] += sign;
}
}
return l2normalize(vec);
},
};
}

interface EmbedCache {
get(key: string): number[] | undefined;
set(key: string, vec: number[]): void;
flush(): Promise<void>;
}

async function loadCache(path: string): Promise<EmbedCache> {
let map = new Map<string, number[]>();
let dirty = false;
try {
const raw = await readFile(path, 'utf8');
map = new Map(Object.entries(JSON.parse(raw) as Record<string, number[]>));
} catch {
// No cache yet; start empty.
}
return {
get: (key) => map.get(key),
set: (key, vec) => {
map.set(key, vec);
dirty = true;
},
flush: async () => {
if (!dirty) return;
try {
await mkdir(dirname(path), { recursive: true });
await writeFile(path, JSON.stringify(Object.fromEntries(map)));
dirty = false;
} catch (err) {
// The on-disk cache is only a cost optimization. At large scale the map
// can exceed V8's max string length when serialized; never let a flush
// failure discard an otherwise-completed eval — warn and continue.
console.warn(`embedding cache flush skipped: ${(err as Error).message}`);
}
},
};
}

/**
* Real OpenAI text-embedding-3-small (1536 dims) with an on-disk,
* content-addressed cache so re-runs are cheap and indexing isn't re-billed.
*/
export async function createOpenAIEmbedder(
apiKey: string,
cachePath: string,
): Promise<Embedder> {
const cache = await loadCache(cachePath);
return {
name: `openai:${OPENAI_MODEL}`,
dims: OPENAI_DIM,
embed: async (text: string) => {
const key = createHash('sha256').update(`${OPENAI_MODEL}\n${text}`).digest('hex');
const cached = cache.get(key);
if (cached !== undefined) return cached;

const res = await fetch('https://api.openai.com/v1/embeddings', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
Authorization: `Bearer ${apiKey}`,
},
body: JSON.stringify({ model: OPENAI_MODEL, input: text }),
});
if (!res.ok) {
const body = await res.text();
throw new Error(`OpenAI embeddings failed (${res.status}): ${body.slice(0, 300)}`);
}
const json = (await res.json()) as { data: { embedding: number[] }[] };
const vec = json.data[0].embedding;
cache.set(key, vec);
return vec;
},
flush: () => cache.flush(),
};
}
130 changes: 130 additions & 0 deletions packages/retrieval/eval/longmemeval/fixture.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
[
{
"question_id": "fix_001",
"question_type": "single-session-user",
"question": "What programming language do I use at work?",
"answer": "Python",
"question_date": "2024-05-01",
"haystack_session_ids": ["A_s0", "A_s1", "A_s2", "A_s3", "A_s4"],
"haystack_dates": ["2024-01-03", "2024-01-10", "2024-02-02", "2024-02-20", "2024-03-15"],
"haystack_sessions": [
[
{ "role": "user", "content": "I tried a new ramen place downtown last night." },
{ "role": "assistant", "content": "How was the broth?" }
],
[
{ "role": "user", "content": "At work I write Python every day for our data pipelines.", "has_answer": true },
{ "role": "assistant", "content": "Python is a solid choice for data engineering." }
],
[
{ "role": "user", "content": "The weather has been rainy all week here." },
{ "role": "assistant", "content": "Hopefully it clears up soon." }
],
[
{ "role": "user", "content": "I watched a documentary about deep sea creatures." },
{ "role": "assistant", "content": "Those are fascinating." }
],
[
{ "role": "user", "content": "My knee felt better after the morning run." },
{ "role": "assistant", "content": "Glad the recovery is going well." }
]
],
"answer_session_ids": ["A_s1"]
},
{
"question_id": "fix_002",
"question_type": "single-session-preference",
"question": "Which city did I book a flight to?",
"answer": "Tokyo",
"question_date": "2024-05-02",
"haystack_session_ids": ["B_s0", "B_s1", "B_s2", "B_s3", "B_s4"],
"haystack_dates": ["2024-01-05", "2024-01-22", "2024-02-11", "2024-03-01", "2024-03-30"],
"haystack_sessions": [
[
{ "role": "user", "content": "I reorganized my bookshelf by color this weekend." },
{ "role": "assistant", "content": "That sounds satisfying." }
],
[
{ "role": "user", "content": "My sourdough starter finally rose properly." },
{ "role": "assistant", "content": "Nice, persistence pays off." }
],
[
{ "role": "user", "content": "I just booked a flight to Tokyo for next spring.", "has_answer": true },
{ "role": "assistant", "content": "Exciting, Tokyo in spring is beautiful." }
],
[
{ "role": "user", "content": "The team standup ran long again today." },
{ "role": "assistant", "content": "Long meetings are draining." }
],
[
{ "role": "user", "content": "I planted tomatoes and basil on the balcony." },
{ "role": "assistant", "content": "Fresh herbs are the best." }
]
],
"answer_session_ids": ["B_s2"]
},
{
"question_id": "fix_003",
"question_type": "single-session-user",
"question": "What is my dog's name?",
"answer": "Rex",
"question_date": "2024-05-03",
"haystack_session_ids": ["C_s0", "C_s1", "C_s2", "C_s3", "C_s4"],
"haystack_dates": ["2024-01-08", "2024-01-19", "2024-02-14", "2024-02-28", "2024-03-21"],
"haystack_sessions": [
[
{ "role": "user", "content": "I finished reading a long fantasy novel finally." },
{ "role": "assistant", "content": "Which series was it?" }
],
[
{ "role": "user", "content": "My dog Rex loves to play fetch in the park every morning.", "has_answer": true },
{ "role": "assistant", "content": "Rex sounds full of energy." }
],
[
{ "role": "user", "content": "I switched my coffee order to a flat white." },
{ "role": "assistant", "content": "A good choice." }
],
[
{ "role": "user", "content": "The car needed an oil change this week." },
{ "role": "assistant", "content": "Routine maintenance helps." }
],
[
{ "role": "user", "content": "I repainted the spare bedroom a soft green." },
{ "role": "assistant", "content": "Calming color." }
]
],
"answer_session_ids": ["C_s1"]
},
{
"question_id": "fix_004",
"question_type": "multi-session",
"question": "How many siblings do I have?",
"answer": "two",
"question_date": "2024-05-04",
"haystack_session_ids": ["D_s0", "D_s1", "D_s2", "D_s3", "D_s4"],
"haystack_dates": ["2024-01-12", "2024-01-25", "2024-02-09", "2024-03-05", "2024-03-27"],
"haystack_sessions": [
[
{ "role": "user", "content": "I upgraded my laptop's memory over the weekend." },
{ "role": "assistant", "content": "More RAM always helps." }
],
[
{ "role": "user", "content": "We tried indoor rock climbing for the first time." },
{ "role": "assistant", "content": "How did it go?" }
],
[
{ "role": "user", "content": "I grew up with two siblings, an older brother and a younger sister.", "has_answer": true },
{ "role": "assistant", "content": "A middle child then." }
],
[
{ "role": "user", "content": "The bakery near my office started selling pretzels." },
{ "role": "assistant", "content": "Warm pretzels are great." }
],
[
{ "role": "user", "content": "I set up a standing desk to fix my posture." },
{ "role": "assistant", "content": "Your back will thank you." }
]
],
"answer_session_ids": ["D_s2"]
}
]
Loading
Loading