From 717ebcf580509916d2ffd58beaf225eb33a8a70e Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Thu, 21 May 2026 17:02:41 -0700 Subject: [PATCH] feat(salience): length filter + tuned exemplars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two iterations on the embedding-based scorer based on first real prod-vault scoring output (PR #149 will get embeddings into the snapshot first; this PR makes the scoring itself more robust): 1. --min-content-length filter (default 50 chars). Hard-filter tiny memories like "halt the run" / "propagate" / "Option 1 or 3" BEFORE scoring. They technically score via breadth (FTS-match many queries) but carry no actual standing-tier constraint — too thin to condition reasoning. Operator can set to 0 to disable, or any other threshold. Rationale for hard filter (not soft penalty): no amount of other signal saves a 2-word memory from being noise in a standing-tier context. Soft penalty would still let edge cases through; hard filter is robust. 2. Tuned exemplar lists. CONSTRAINT_EXEMPLARS expanded from 10 to 20 patterns drawn from real Brian-coded standing rules observed during this session: - SOTA / institutional rules (5) - Verification / discipline (5) - Failure / error handling (4) - Process / coordination (3) - Existential constraints (3) TIME_BOUNDED_EXEMPLARS expanded from 10 to 19 patterns. New coverage includes: - Session handoff shapes (5): the dominant noise pattern in bare-prod-snapshot scoring (PR #149 output) was tiny session handoffs like "Session: proceed". These need explicit negative-exemplar coverage. - PR / commit references (2) - Tiny single-thought patterns (3): "halt the run", "propagate", "Option 1 or 3" — defense in depth with the length filter. Verified locally: smoke against 4-doc vault, length filter drops 1 sub-50-char doc; scoring still discriminates. Full pytest 801, harness 13/13. 
Composes with PR #149 — once both merge: scripts/salience_phase0.sh snapshot # pulls vec.npz alongside sqlite scripts/salience_phase0.sh score # tuned exemplars + length filter Expected: scoring against prod will produce meaningfully better picks. Constraint-shape memories should rise, session-handoff noise should be penalized (high time_penalty), and sub-50-char single-thought memories should be filtered out entirely. Future iteration paths (NOT in this PR): - Vault-derived auto-exemplars (sample high-confidence feedback/ preference memories as positive exemplars instead of hand- tuning). True prototype-network design. ~1h work. - Per-content-type length thresholds (handoffs often have meta headers like "- Topic: X" that inflate length without adding constraint content). Operator-tunable. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/build_standing_set.py | 70 ++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 10 deletions(-) diff --git a/scripts/build_standing_set.py b/scripts/build_standing_set.py index df5b293..55fae5b 100644 --- a/scripts/build_standing_set.py +++ b/scripts/build_standing_set.py @@ -82,29 +82,57 @@ # rule-like, anchoring to date/event markers means it's not durable. 
CONSTRAINT_EXEMPLARS = [ - "default to the institutional approach with no shortcuts", + # SOTA / institutional rules + "default to the SOTA / institutional approach, no shortcuts", + "use the right primitive, not the smaller diff", + "the most-correct, most-robust route is also the faster path to production", + "only deviate from SOTA with an explicit written rationale", + "lift to library when two or more consumers exist", + # Verification / discipline "always verify before promoting to production", - "never argmax-route to a per-regime sub-model", + "audit before scoping, the codebase is the source of truth", + "every audit finding becomes a ROADMAP follow-up item", + "verify branch state before running apply.sh", + "test reproduction before shipping the static root cause", + # Failure / error handling "fail loud and fast on errors, no silent swallows", - "prefer the most correct path over the smaller diff", + "any swallow must carry an inline comment naming the failure mode", + "raise so the failure surfaces at the earliest possible callsite", + "graceful degrade is forbidden on producer / writer paths", + # Process / coordination + "every PR deploy appends to the system-wide changelog automatically", + "never argmax-route to a per-regime sub-model", + "use canonical alpha labels with explicit clipping over raw arithmetic returns", + # Existential constraints "runway is not a constraint, optimize for preference not necessity", - "use the SOTA primitive when it exists, do not ship a workaround", - "the most robust route is also the faster path to production", "X is not Y — assert the constraint explicitly", - "audit before scoping, the codebase is the source of truth", + "this fact conditions reasoning regardless of query similarity", ] TIME_BOUNDED_EXEMPLARS = [ + # Session handoffs (these consistently get caught by FTS-breadth noise) + "Session: proceed with option A", + "Session: pr merged", + "Session: ok so you are telling me", + "Session: i was wondering", 
+ "Session: i think it may be best", + # Status updates with date markers "today's saturday SF run completed successfully", "yesterday's deploy of PR was merged to main", "shipped this week as part of the rc18 release", "tomorrow's market open at 6:30 AM PT", "the 2026-05-21 incident response writeup", - "session handoff from the Tuesday evening session", "scheduled for Wednesday afternoon trigger", "merged commit abc123 to main on Friday", "this morning's MorningEnrich step in the weekday SF", "post-market reconciliation completed for May 21", + # PR / commit specific + "PR #143 merged at 22:17:17Z", + "v0.6.0 tag pushed to origin", + # Tiny single-thought memories (no constraint, no context) + "halt the run", + "propagate", + "Option 1 or 3", ] # Operator-tunable correction patterns (existing heuristic, kept). @@ -123,6 +151,14 @@ DEFAULT_TOP_N = 10 HARD_CEILING = 20 # plan invariant: never exceed 20 +# Minimum content length for standing-tier consideration. A 2-word +# memory like "halt the run" or "propagate" technically scores via +# breadth (FTS-matches many queries) but carries no actual constraint +# — too thin to condition reasoning. Hard-filter rather than soft-penalty +# because no amount of other signal saves a 2-word memory from being +# noise in a standing-tier context. +DEFAULT_MIN_CONTENT_LENGTH = 50 + def _resolve_db(vault_override: str | None, db_override: str | None) -> Path: if db_override: @@ -277,6 +313,11 @@ def main() -> int: help="print candidates but don't write standing.json / standing-rendered.md") ap.add_argument("--show", type=int, default=30, help="how many top-scored candidates to display (default: 30)") + ap.add_argument("--min-content-length", type=int, default=DEFAULT_MIN_CONTENT_LENGTH, + help=f"drop memories whose content is shorter than N chars BEFORE scoring " + f"(default: {DEFAULT_MIN_CONTENT_LENGTH}; set to 0 to disable). 
" + f"Filters out tiny noise memories like 'halt the run' / 'propagate' that " + f"FTS-match many queries but carry no actual constraint.") args = ap.parse_args() if args.top > HARD_CEILING: @@ -294,7 +335,7 @@ def main() -> int: conn = sqlite3.connect(str(db_path)) conn.row_factory = sqlite3.Row - docs = [dict(r) for r in conn.execute( + all_docs = [dict(r) for r in conn.execute( """ SELECT d.id, d.title, d.content_type, d.confidence, d.hash, c.doc AS content FROM documents d @@ -303,14 +344,23 @@ def main() -> int: """ ).fetchall()] - if not docs: + if not all_docs: print(f"vault {db_path} has no live memories", file=sys.stderr) return 1 + # Length filter — drop tiny noise BEFORE scoring. + n_before = len(all_docs) + min_len = max(0, args.min_content_length) + if min_len > 0: + docs = [d for d in all_docs if len(d["content"] or "") >= min_len] + else: + docs = all_docs + n_filtered = n_before - len(docs) + print(f"# Standing-tier scoring (embedding-based, SOTA non-LLM)", file=sys.stderr) print(f"# Vault: {db_path}", file=sys.stderr) print(f"# Vecstore: {vecstore_path}", file=sys.stderr) - print(f"# Live memories: {len(docs)}", file=sys.stderr) + print(f"# Live memories: {len(docs)} (filtered {n_filtered} below {min_len}-char threshold)", file=sys.stderr) # Embedding-based signals print(f"# Embedding exemplars + memories ...", file=sys.stderr)