From 162b17dd6955406a78e65374513c6d2da27b2c41 Mon Sep 17 00:00:00 2001
From: Brian McMahon <brian@nousergon.ai>
Date: Thu, 21 May 2026 17:21:15 -0700
Subject: [PATCH] feat(salience): dedup near-duplicate memories by source_key /
 title
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Caught 2026-05-21 on first real prod-vault scoring output: 3 of the
auto-selected top-10 were the same memory ("System-wide deploy
changelog" at ids 600, 617, 644 — same title, content edited
iteratively across sessions, three different content_hashes).

Bare content-hash dedup misses this — those memories were
near-duplicates, not byte-duplicates. The right dedup primitive is
mnemon's source_key (post-rc16 canonical identity via
store.save's upsert-by-slug), falling back to title for pre-rc16
saves.

Dedup key priority:
  1. source_key   — post-rc16 canonical identity
  2. title        — lowercased + stripped, catches pre-rc16 dupes
  3. id           — no source_key and no title; never merged

Keep the most recent (highest id) per dedup key — it has the most
current title / confidence / content metadata.

Verified against prod snapshot (/tmp/mnemon-prod-snap.sqlite):
  BEFORE: Live memories: 2084 (3 of top-10 are the same memory)
  AFTER:  Live memories: 1872 (deduped 212 across whole vault)

The 3 deploy-changelog entries collapsed to id 644. The auto-selected
top-10 now fills its other 9 slots with distinct candidates instead of
triple-counting the same fact.

Full suite: 801 passed. Harness: 13/13.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 scripts/build_standing_set.py | 38 +++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/scripts/build_standing_set.py b/scripts/build_standing_set.py
index 55fae5b..57f2fd4 100644
--- a/scripts/build_standing_set.py
+++ b/scripts/build_standing_set.py
@@ -337,7 +337,7 @@ def main() -> int:
 
     all_docs = [dict(r) for r in conn.execute(
         """
-        SELECT d.id, d.title, d.content_type, d.confidence, d.hash, c.doc AS content
+        SELECT d.id, d.title, d.content_type, d.confidence, d.hash, d.source_key, c.doc AS content
         FROM documents d
         JOIN content c ON d.hash = c.hash
         WHERE d.invalidated_at IS NULL
@@ -357,10 +357,44 @@ def main() -> int:
         docs = all_docs
     n_filtered = n_before - len(docs)
 
+    # Dedup near-duplicate iterations of the same memory. Caught
+    # 2026-05-21 when "System-wide deploy changelog" appeared 3x in
+    # auto-selected top-10 — same title, content edited iteratively
+    # across sessions, three different content_hashes.
+    #
+    # Dedup key priority:
+    #   1. source_key (post-rc16 canonical identity from mnemon.store.save)
+    #   2. title (lowercased, stripped)
+    #   3. id (no source_key and no title — never merged with another)
+    #
+    # Keep the most recent (highest id) per dedup key — it has the
+    # most current title / confidence / content metadata.
+    by_key: dict[str, dict] = {}
+    for d in docs:
+        sk = (d["source_key"] or "").strip() if "source_key" in d.keys() else ""
+        title = (d["title"] or "").strip().lower()
+        if sk:
+            key = f"sk:{sk}"
+        elif title:
+            key = f"title:{title}"
+        else:
+            # No source_key, no title — never dedup with another memory.
+            key = f"id:{d['id']}"
+        existing = by_key.get(key)
+        if existing is None or d["id"] > existing["id"]:
+            by_key[key] = d
+    n_deduped = len(docs) - len(by_key)
+    docs = list(by_key.values())
+
     print(f"# Standing-tier scoring (embedding-based, SOTA non-LLM)", file=sys.stderr)
     print(f"# Vault:    {db_path}", file=sys.stderr)
     print(f"# Vecstore: {vecstore_path}", file=sys.stderr)
-    print(f"# Live memories: {len(docs)} (filtered {n_filtered} below {min_len}-char threshold)", file=sys.stderr)
+    print(
+        f"# Live memories: {len(docs)} "
+        f"(filtered {n_filtered} below {min_len}-char, "
+        f"deduped {n_deduped} source_key/title near-duplicates)",
+        file=sys.stderr,
+    )
 
     # Embedding-based signals
     print(f"# Embedding exemplars + memories ...", file=sys.stderr)