From 162b17dd6955406a78e65374513c6d2da27b2c41 Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Thu, 21 May 2026 17:21:15 -0700 Subject: [PATCH] feat(salience): dedup near-duplicate memories by source_key / title MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Caught 2026-05-21 on first real prod-vault scoring output: 3 of the auto-selected top-10 were the same memory ("System-wide deploy changelog" at ids 600, 617, 644 — same title, content edited iteratively across sessions, three different content_hashes). Bare content-hash dedup misses this — those memories were near-duplicates, not byte-duplicates. The right dedup primitive is mnemon's source_key (post-rc16 canonical identity via store.save's upsert-by-slug) falling back to title for pre-rc16 saves. Dedup key priority: 1. source_key — post-rc16 canonical identity 2. title — lowercased + stripped, catches pre-rc16 dupes 3. id — fallback for memories with neither source_key nor title (never merged with another memory) Keep the most recent (highest id) per dedup key — it has the most current title / confidence / content metadata. Verified against prod snapshot (/tmp/mnemon-prod-snap.sqlite): BEFORE: Live memories: 2084 (3 of top-10 are the same memory) AFTER: Live memories: 1872 (deduped 212 across whole vault) The 3 deploy-changelog entries collapsed to id 644. Auto-selected top-10 now picks 9 more distinct candidates instead of triple-counting the same fact. Full suite: 801 passed. Harness: 13/13. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/build_standing_set.py | 38 +++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/scripts/build_standing_set.py b/scripts/build_standing_set.py index 55fae5b..57f2fd4 100644 --- a/scripts/build_standing_set.py +++ b/scripts/build_standing_set.py @@ -337,7 +337,7 @@ def main() -> int: all_docs = [dict(r) for r in conn.execute( """ - SELECT d.id, d.title, d.content_type, d.confidence, d.hash, c.doc AS content + SELECT d.id, d.title, d.content_type, d.confidence, d.hash, d.source_key, c.doc AS content FROM documents d JOIN content c ON d.hash = c.hash WHERE d.invalidated_at IS NULL @@ -357,10 +357,44 @@ def main() -> int: docs = all_docs n_filtered = n_before - len(docs) + # Dedup near-duplicate iterations of the same memory. Caught + # 2026-05-21 when "System-wide deploy changelog" appeared 3x in + # auto-selected top-10 — same title, content edited iteratively + # across sessions, three different content_hashes. + # + # Dedup key priority: + # 1. source_key (post-rc16 canonical identity from mnemon.store.save) + # 2. title (lowercased, stripped) + # 3. id (fallback — only when both source_key and title are empty) + # + # Keep the most recent (highest id) per dedup key — it has the + # most current title / confidence / content metadata. + by_key: dict[str, dict] = {} + for d in docs: + sk = (d["source_key"] or "").strip() if "source_key" in d.keys() else "" + title = (d["title"] or "").strip().lower() + if sk: + key = f"sk:{sk}" + elif title: + key = f"title:{title}" + else: + # No source_key, no title — never dedup with another memory. 
+ key = f"id:{d['id']}" + existing = by_key.get(key) + if existing is None or d["id"] > existing["id"]: + by_key[key] = d + n_deduped = len(docs) - len(by_key) + docs = list(by_key.values()) + print(f"# Standing-tier scoring (embedding-based, SOTA non-LLM)", file=sys.stderr) print(f"# Vault: {db_path}", file=sys.stderr) print(f"# Vecstore: {vecstore_path}", file=sys.stderr) - print(f"# Live memories: {len(docs)} (filtered {n_filtered} below {min_len}-char threshold)", file=sys.stderr) + print( + f"# Live memories: {len(docs)} " + f"(filtered {n_filtered} below {min_len}-char, " + f"deduped {n_deduped} content-hash duplicates)", + file=sys.stderr, + ) # Embedding-based signals print(f"# Embedding exemplars + memories ...", file=sys.stderr)