BetterDB-inc · KIvanow · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/.gitignore b/.gitignore
@@ -82,3 +82,8 @@ packages/mcp/.mcpregistry_github_token
 .gitignore
 packages/mcp/.mcpregistry_registry_token
 data/license.key
+
+# LongMemEval retrieval harness: embedding cache + downloaded datasets
+packages/retrieval/eval/longmemeval/.cache/
+packages/retrieval/eval/longmemeval/longmemeval_*.json
+packages/retrieval/eval/longmemeval/*_oracle.json
diff --git a/packages/retrieval/eval/longmemeval/adapter.ts b/packages/retrieval/eval/longmemeval/adapter.ts
@@ -0,0 +1,97 @@
+import type { UpsertEntry, QueryHit } from '../../src/index';
+import type { ChunkMode, LmeRecord, LmeSession } from './types';
+
+// text-embedding-3-small accepts at most 8191 tokens per input. Cap each chunk
+// well under that so a long session is split into multiple chunks instead of
+// failing the embedding call: at the ~4 chars/token heuristic, 24000 chars is
+// roughly 6000 tokens, which leaves margin below the limit.
+// NOTE(review): the heuristic is an estimate, not a guarantee; text that
+// tokenizes more densely could presumably still exceed the limit; confirm.
+// Every part keeps the session's session_id, so recall (which matches on
+// session_id) is unaffected.
+const MAX_EMBED_CHARS = 24000;
+
+/**
+ * Hard-slice a string into consecutive pieces of at most `budget` UTF-16 code
+ * units each; concatenating the pieces reproduces `text` exactly.
+ *
+ * A cut is moved one unit earlier when it would land between the two halves of
+ * a surrogate pair, so an astral character (emoji etc.) is never torn across
+ * two pieces. The one exception is a budget of 1, where a pair cannot fit in a
+ * single piece.
+ *
+ * @param text - String to slice; an empty string yields an empty array.
+ * @param budget - Maximum piece length; fractional values are rounded down.
+ * @returns The pieces, in order.
+ * @throws RangeError if `budget` is NaN or rounds down to less than 1, because
+ *   such a step would never advance through `text`.
+ */
+function sliceToBudget(text: string, budget: number): string[] {
+  const step = Math.floor(budget);
+  if (!(step >= 1)) {
+    throw new RangeError(`sliceToBudget: budget must be at least 1, got ${budget}`);
+  }
+  const parts: string[] = [];
+  let start = 0;
+  while (start < text.length) {
+    let end = Math.min(start + step, text.length);
+    // Only back off when the piece keeps at least one unit, so progress is guaranteed.
+    if (end < text.length && end - start > 1) {
+      const before = text.charCodeAt(end - 1);
+      const after = text.charCodeAt(end);
+      const splitsPair =
+        before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
+      if (splitsPair) end -= 1;
+    }
+    parts.push(text.slice(start, end));
+    start = end;
+  }
+  return parts;
+}
+
+/**
+ * Groups one session's turns into text chunks of at most `budget` characters.
+ * Each turn is rendered as `role: content`; rendered turns are joined with a
+ * newline for as long as they fit, and a new chunk starts when the next one
+ * would overflow.
+ */
+function packTurns(session: LmeSession, budget: number): string[] {
+  // Render every turn, pre-splitting any rendered turn that alone exceeds the budget.
+  const pieces: string[] = [];
+  for (const turn of session) {
+    const rendered = `${turn.role}: ${turn.content}`;
+    if (rendered.length > budget) {
+      pieces.push(...sliceToBudget(rendered, budget));
+    } else {
+      pieces.push(rendered);
+    }
+  }
+
+  const packed: string[] = [];
+  let buffer = '';
+  for (const piece of pieces) {
+    if (buffer.length === 0) {
+      buffer = piece;
+      continue;
+    }
+    // The extra 1 accounts for the newline that would join the two.
+    if (buffer.length + 1 + piece.length > budget) {
+      packed.push(buffer);
+      buffer = piece;
+    } else {
+      buffer = `${buffer}\n${piece}`;
+    }
+  }
+  if (buffer.length > 0) packed.push(buffer);
+  return packed;
+}
+
+/**
+ * Converts the haystack of one LongMemEval record into UpsertEntry chunks.
+ *
+ * Modes:
+ * - 'turn': every turn becomes its own chunk; a turn longer than the embedder
+ *   budget is hard-sliced into several parts.
+ * - anything else (i.e. 'session'): the turns of a session are packed
+ *   together; a session longer than the budget yields several parts.
+ *
+ * Ids are `s<session>` / `s<session>_t<turn>`, with a `_p<part>` suffix only
+ * when the text had to be split. Every chunk's fields hold the session_id
+ * (falling back to `session_<index>` when none is listed) and, when a
+ * non-empty date is present, the date, so recall can match on session_id.
+ */
+export function chunkRecord(record: LmeRecord, mode: ChunkMode): UpsertEntry[] {
+  const out: UpsertEntry[] = [];
+
+  record.haystack_sessions.forEach((turns, sessionIndex) => {
+    // Tags shared by every chunk cut from this session.
+    const tags: Record<string, string> = {
+      session_id: record.haystack_session_ids[sessionIndex] ?? `session_${sessionIndex}`,
+    };
+    const when = record.haystack_dates?.[sessionIndex];
+    if (when !== undefined && when !== '') tags.date = when;
+
+    // Each entry receives its own copy of the tags, never a shared object.
+    const emit = (id: string, text: string): void => {
+      out.push({ id, text, fields: { ...tags } });
+    };
+
+    if (mode !== 'turn') {
+      const packed = packTurns(turns, MAX_EMBED_CHARS);
+      const wasSplit = packed.length > 1;
+      packed.forEach((text, partIndex) => {
+        emit(wasSplit ? `s${sessionIndex}_p${partIndex}` : `s${sessionIndex}`, text);
+      });
+      return;
+    }
+
+    turns.forEach((turn, turnIndex) => {
+      const rendered = `${turn.role}: ${turn.content}`;
+      const turnId = `s${sessionIndex}_t${turnIndex}`;
+      if (rendered.length <= MAX_EMBED_CHARS) {
+        emit(turnId, rendered);
+        return;
+      }
+      // An oversized turn is hard-sliced so it still embeds instead of failing.
+      sliceToBudget(rendered, MAX_EMBED_CHARS).forEach((part, partIndex) => {
+        emit(`${turnId}_p${partIndex}`, part);
+      });
+    });
+  });
+
+  return out;
+}
+
+/** Reports whether at least one retrieved chunk carries a session_id listed in `answerSessionIds`. */
+export function recordIsHit(hits: QueryHit[], answerSessionIds: string[]): boolean {
+  const wanted = new Set(answerSessionIds);
+  for (const hit of hits) {
+    if (wanted.has(hit.fields.session_id)) {
+      return true;
+    }
+  }
+  return false;
+}
diff --git a/packages/retrieval/eval/longmemeval/dataset.ts b/packages/retrieval/eval/longmemeval/dataset.ts
@@ -0,0 +1,28 @@
+import { readFile } from 'node:fs/promises';
+import { fileURLToPath } from 'node:url';
+import { dirname, join } from 'node:path';
+import type { LmeRecord } from './types';
+
+/** Builds the path of the `fixture.json` file located in this module's own directory. */
+function fixturePath(): string {
+  const thisFile = fileURLToPath(import.meta.url);
+  const thisDir = dirname(thisFile);
+  return join(thisDir, 'fixture.json');
+}
+
+/**
+ * Reads the bundled LongMemEval-shaped fixture from disk and parses it.
+ * Works offline and always yields the same records.
+ */
+export async function loadFixture(): Promise<LmeRecord[]> {
+  const contents = await readFile(fixturePath(), 'utf8');
+  const parsed = JSON.parse(contents) as LmeRecord[];
+  return parsed;
+}
+
+/**
+ * Load the dataset: the real LongMemEval json at `dataPath` when given, else
+ * the bundled fixture.
+ *
+ * @param dataPath - Path to a LongMemEval json file; `undefined` or `''`
+ *   selects the bundled fixture.
+ * @returns The records plus a human-readable source label (the path itself,
+ *   or `'bundled fixture'`).
+ * @throws Error if the file at `dataPath` parses to something other than a
+ *   JSON array; read and JSON syntax errors propagate unchanged.
+ */
+export async function loadDataset(
+  dataPath: string | undefined,
+): Promise<{ records: LmeRecord[]; source: string }> {
+  if (dataPath === undefined || dataPath === '') {
+    return { records: await loadFixture(), source: 'bundled fixture' };
+  }
+  const raw = await readFile(dataPath, 'utf8');
+  const parsed: unknown = JSON.parse(raw);
+  // Fail here with the path in the message rather than deep inside the eval
+  // when the file is valid JSON but not a list of records.
+  if (!Array.isArray(parsed)) {
+    throw new Error(`LongMemEval dataset at ${dataPath} must be a JSON array of records`);
+  }
+  return { records: parsed as LmeRecord[], source: dataPath };
+}
diff --git a/packages/retrieval/eval/longmemeval/embed.ts b/packages/retrieval/eval/longmemeval/embed.ts
@@ -0,0 +1,120 @@
+import { createHash } from 'node:crypto';
+import { mkdir, readFile, writeFile } from 'node:fs/promises';
+import { dirname } from 'node:path';
+import type { Embedder } from './types';
+
+// Default vector length for the mock embedder (see createMockEmbedder).
+const MOCK_DIM = 256;
+// Model name sent to the OpenAI embeddings endpoint; also part of the cache key.
+const OPENAI_MODEL = 'text-embedding-3-small';
+// Dimension count the OpenAI embedder reports as `dims`.
+const OPENAI_DIM = 1536;
+
+/** Lower-cases `text` and returns its maximal runs of `a-z` / `0-9`, in order. */
+function tokenize(text: string): string[] {
+  const runs = text.toLowerCase().match(/[a-z0-9]+/g);
+  return runs === null ? [] : Array.from(runs);
+}
+
+/**
+ * Scales `vec` to unit Euclidean length and returns the result as a new array.
+ * When the computed length is 0 (or NaN) the divisor falls back to 1, so an
+ * all-zero vector comes back unchanged.
+ */
+function l2normalize(vec: number[]): number[] {
+  let sumOfSquares = 0;
+  for (const component of vec) {
+    sumOfSquares += component * component;
+  }
+  const length = Math.sqrt(sumOfSquares) || 1;
+  const scaled: number[] = [];
+  for (const component of vec) {
+    scaled.push(component / length);
+  }
+  return scaled;
+}
+
+/**
+ * Builds an offline embedder based on a hashed bag of words. Every token is
+ * SHA-256 hashed and contributes +1 or -1 to four hash-selected dimensions, so
+ * texts that share tokens end up with a higher cosine similarity. Good enough
+ * to exercise ranking; it is not a semantic score. Needs no network or keys.
+ */
+export function createMockEmbedder(dim = MOCK_DIM): Embedder {
+  const SLOTS_PER_TOKEN = 4;
+  const BYTES_PER_SLOT = 4;
+
+  const embed = async (text: string) => {
+    const acc = new Array<number>(dim).fill(0);
+    for (const token of tokenize(text)) {
+      const digest = createHash('sha256').update(token).digest();
+      for (let offset = 0; offset < SLOTS_PER_TOKEN * BYTES_PER_SLOT; offset += BYTES_PER_SLOT) {
+        // A little-endian 32-bit word of the digest selects the dimension...
+        const slot = digest.readUInt32LE(offset) % dim;
+        // ...and the low bit of that word's last byte selects the sign.
+        const weight = (digest[offset + 3] & 1) === 0 ? 1 : -1;
+        acc[slot] += weight;
+      }
+    }
+    return l2normalize(acc);
+  };
+
+  return { name: `mock-hashed-bow(dim=${dim})`, dims: dim, embed };
+}
+
+/** Keyed store of embedding vectors with an explicit persistence step (built by `loadCache`). */
+interface EmbedCache {
+  /** Returns the vector stored under `key`, or `undefined` when there is none. */
+  get(key: string): number[] | undefined;
+  /** Stores `vec` under `key`. */
+  set(key: string, vec: number[]): void;
+  /** Persists the cache; the returned promise settles once the attempt is over. */
+  flush(): Promise<void>;
+}
+
+/**
+ * Opens the on-disk embedding cache at `path` as an in-memory map.
+ *
+ * A missing file simply yields an empty cache. Any other load problem
+ * (unreadable file, invalid JSON, JSON that is not an object) also yields an
+ * empty cache but is reported with `console.warn`, so a damaged cache does
+ * not silently cause every embedding to be fetched again.
+ *
+ * @param path - Location of the JSON cache file (an object of key -> vector).
+ * @returns A cache whose `flush` writes the whole map back to `path` when
+ *   something was added since the last successful flush. `flush` never
+ *   rejects: a failed write is logged and the cache stays dirty.
+ */
+async function loadCache(path: string): Promise<EmbedCache> {
+  let map = new Map<string, number[]>();
+  let dirty = false;
+  try {
+    const raw = await readFile(path, 'utf8');
+    const parsed: unknown = JSON.parse(raw);
+    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
+      throw new Error('cache file does not contain a JSON object');
+    }
+    map = new Map(Object.entries(parsed as Record<string, number[]>));
+  } catch (err) {
+    // ENOENT just means there is no cache yet; anything else deserves a notice.
+    const code = err instanceof Error ? (err as { code?: unknown }).code : undefined;
+    if (code !== 'ENOENT') {
+      const reason = err instanceof Error ? err.message : String(err);
+      console.warn(`embedding cache at ${path} ignored: ${reason}`);
+    }
+  }
+  return {
+    get: (key) => map.get(key),
+    set: (key, vec) => {
+      map.set(key, vec);
+      dirty = true;
+    },
+    flush: async () => {
+      if (!dirty) return;
+      try {
+        await mkdir(dirname(path), { recursive: true });
+        await writeFile(path, JSON.stringify(Object.fromEntries(map)));
+        dirty = false;
+      } catch (err) {
+        // The on-disk cache is only a cost optimization. At large scale the map
+        // can exceed V8's max string length when serialized; never let a flush
+        // failure discard an otherwise-completed eval — warn and continue.
+        const reason = err instanceof Error ? err.message : String(err);
+        console.warn(`embedding cache flush skipped: ${reason}`);
+      }
+    },
+  };
+}
+
+/**
+ * Real OpenAI text-embedding-3-small (1536 dims) with an on-disk,
+ * content-addressed cache so re-runs are cheap and indexing isn't re-billed.
+ *
+ * @param apiKey - Sent as the Bearer token on every embeddings request.
+ * @param cachePath - File backing the cache; loaded once, up front.
+ * @returns An embedder whose `embed` answers from the cache when the
+ *   (model, text) pair was seen before and otherwise calls the API, and whose
+ *   `flush` persists newly fetched vectors. Nothing is written to disk until
+ *   `flush` is called.
+ * @throws Error from `embed` when the API answers with a non-2xx status or
+ *   with a body that holds no embedding vector.
+ */
+export async function createOpenAIEmbedder(
+  apiKey: string,
+  cachePath: string,
+): Promise<Embedder> {
+  const cache = await loadCache(cachePath);
+  return {
+    name: `openai:${OPENAI_MODEL}`,
+    dims: OPENAI_DIM,
+    embed: async (text: string) => {
+      // The model name is part of the key so a model change never reuses stale vectors.
+      const key = createHash('sha256').update(`${OPENAI_MODEL}\n${text}`).digest('hex');
+      const cached = cache.get(key);
+      if (cached !== undefined) return cached;
+
+      const res = await fetch('https://api.openai.com/v1/embeddings', {
+        method: 'POST',
+        headers: {
+          'Content-Type': 'application/json',
+          Authorization: `Bearer ${apiKey}`,
+        },
+        body: JSON.stringify({ model: OPENAI_MODEL, input: text }),
+      });
+      if (!res.ok) {
+        const body = await res.text();
+        throw new Error(`OpenAI embeddings failed (${res.status}): ${body.slice(0, 300)}`);
+      }
+      const json = (await res.json()) as { data?: { embedding?: unknown }[] };
+      const embedding = json.data?.[0]?.embedding;
+      // Reject an unexpected body explicitly rather than caching `undefined`
+      // or failing with an opaque TypeError.
+      if (!Array.isArray(embedding) || embedding.length === 0) {
+        throw new Error('OpenAI embeddings response did not contain an embedding vector');
+      }
+      const vec: number[] = embedding;
+      cache.set(key, vec);
+      return vec;
+    },
+    flush: () => cache.flush(),
+  };
+}
diff --git a/packages/retrieval/eval/longmemeval/fixture.json b/packages/retrieval/eval/longmemeval/fixture.json
@@ -0,0 +1,130 @@
+[
+  {
+    "question_id": "fix_001",
+    "question_type": "single-session-user",
+    "question": "What programming language do I use at work?",
+    "answer": "Python",
+    "question_date": "2024-05-01",
+    "haystack_session_ids": ["A_s0", "A_s1", "A_s2", "A_s3", "A_s4"],
+    "haystack_dates": ["2024-01-03", "2024-01-10", "2024-02-02", "2024-02-20", "2024-03-15"],
+    "haystack_sessions": [
+      [
+        { "role": "user", "content": "I tried a new ramen place downtown last night." },
+        { "role": "assistant", "content": "How was the broth?" }
+      ],
+      [
+        { "role": "user", "content": "At work I write Python every day for our data pipelines.", "has_answer": true },
+        { "role": "assistant", "content": "Python is a solid choice for data engineering." }
+      ],
+      [
+        { "role": "user", "content": "The weather has been rainy all week here." },
+        { "role": "assistant", "content": "Hopefully it clears up soon." }
+      ],
+      [
+        { "role": "user", "content": "I watched a documentary about deep sea creatures." },
+        { "role": "assistant", "content": "Those are fascinating." }
+      ],
+      [
+        { "role": "user", "content": "My knee felt better after the morning run." },
+        { "role": "assistant", "content": "Glad the recovery is going well." }
+      ]
+    ],
+    "answer_session_ids": ["A_s1"]
+  },
+  {
+    "question_id": "fix_002",
+    "question_type": "single-session-preference",
+    "question": "Which city did I book a flight to?",
+    "answer": "Tokyo",
+    "question_date": "2024-05-02",
+    "haystack_session_ids": ["B_s0", "B_s1", "B_s2", "B_s3", "B_s4"],
+    "haystack_dates": ["2024-01-05", "2024-01-22", "2024-02-11", "2024-03-01", "2024-03-30"],
+    "haystack_sessions": [
+      [
+        { "role": "user", "content": "I reorganized my bookshelf by color this weekend." },
+        { "role": "assistant", "content": "That sounds satisfying." }
+      ],
+      [
+        { "role": "user", "content": "My sourdough starter finally rose properly." },
+        { "role": "assistant", "content": "Nice, persistence pays off." }
+      ],
+      [
+        { "role": "user", "content": "I just booked a flight to Tokyo for next spring.", "has_answer": true },
+        { "role": "assistant", "content": "Exciting, Tokyo in spring is beautiful." }
+      ],
+      [
+        { "role": "user", "content": "The team standup ran long again today." },
+        { "role": "assistant", "content": "Long meetings are draining." }
+      ],
+      [
+        { "role": "user", "content": "I planted tomatoes and basil on the balcony." },
+        { "role": "assistant", "content": "Fresh herbs are the best." }
+      ]
+    ],
+    "answer_session_ids": ["B_s2"]
+  },
+  {
+    "question_id": "fix_003",
+    "question_type": "single-session-user",
+    "question": "What is my dog's name?",
+    "answer": "Rex",
+    "question_date": "2024-05-03",
+    "haystack_session_ids": ["C_s0", "C_s1", "C_s2", "C_s3", "C_s4"],
+    "haystack_dates": ["2024-01-08", "2024-01-19", "2024-02-14", "2024-02-28", "2024-03-21"],
+    "haystack_sessions": [
+      [
+        { "role": "user", "content": "I finished reading a long fantasy novel finally." },
+        { "role": "assistant", "content": "Which series was it?" }
+      ],
+      [
+        { "role": "user", "content": "My dog Rex loves to play fetch in the park every morning.", "has_answer": true },
+        { "role": "assistant", "content": "Rex sounds full of energy." }
+      ],
+      [
+        { "role": "user", "content": "I switched my coffee order to a flat white." },
+        { "role": "assistant", "content": "A good choice." }
+      ],
+      [
+        { "role": "user", "content": "The car needed an oil change this week." },
+        { "role": "assistant", "content": "Routine maintenance helps." }
+      ],
+      [
+        { "role": "user", "content": "I repainted the spare bedroom a soft green." },
+        { "role": "assistant", "content": "Calming color." }
+      ]
+    ],
+    "answer_session_ids": ["C_s1"]
+  },
+  {
+    "question_id": "fix_004",
+    "question_type": "multi-session",
+    "question": "How many siblings do I have?",
+    "answer": "two",
+    "question_date": "2024-05-04",
+    "haystack_session_ids": ["D_s0", "D_s1", "D_s2", "D_s3", "D_s4"],
+    "haystack_dates": ["2024-01-12", "2024-01-25", "2024-02-09", "2024-03-05", "2024-03-27"],
+    "haystack_sessions": [
+      [
+        { "role": "user", "content": "I upgraded my laptop's memory over the weekend." },
+        { "role": "assistant", "content": "More RAM always helps." }
+      ],
+      [
+        { "role": "user", "content": "We tried indoor rock climbing for the first time." },
+        { "role": "assistant", "content": "How did it go?" }
+      ],
+      [
+        { "role": "user", "content": "I grew up with two siblings, an older brother and a younger sister.", "has_answer": true },
+        { "role": "assistant", "content": "A middle child then." }
+      ],
+      [
+        { "role": "user", "content": "The bakery near my office started selling pretzels." },
+        { "role": "assistant", "content": "Warm pretzels are great." }
+      ],
+      [
+        { "role": "user", "content": "I set up a standing desk to fix my posture." },
+        { "role": "assistant", "content": "Your back will thank you." }
+      ]
+    ],
+    "answer_session_ids": ["D_s2"]
+  }
+]