From ba6be83b80b1589b1484e92d024314a88d518563 Mon Sep 17 00:00:00 2001 From: Brian McMahon Date: Sun, 17 May 2026 14:21:44 -0700 Subject: [PATCH] fix(cio): reconcile CIO decisions vs candidate set, not raw count The 2026-05-17 Saturday SF research run hard-failed in strict mode: Sonnet's structured-output batch returned 19 decisions for 18 candidates (one stray extra/duplicate decision object). The `len(decisions) != len(candidates)` assertion in run_cio (added 2026-05-02 for the partial-list edge) turned this benign LLM artifact into a failure of the entire weekly pipeline. Replace the raw count check with `_reconcile_cio_decisions`, which validates against the candidate ticker SET: * extraneous (ticker not in candidate set) -> dropped + logged * duplicate -> collapsed conservative-wins (a duplicate can never upgrade a candidate into advancement; ties keep first occurrence) * decisions emitted in candidate order, ticker normalised to the candidate's canonical spelling so _post_process_cio_decisions exact-match stays deterministic across LLM casing/whitespace drift * genuine MISSING candidate after reconciliation -> strict-mode raise (preserves the 2026-05-02 partial-list protection) Strictly stronger than the old check: 18-for-18 with one ticker duplicated (a real candidate silently dropped) PASSED the count check; set reconciliation correctly hard-fails it. Strict-mode message keeps the "N decisions for M candidates" substrings for log/grep continuity. Tests: +6 reconciliation cases (exact 2026-05-17 shape, hallucinated ticker, conservative-wins, casing/whitespace, count-equal-but-missing, first-wins tie) extending test_cio_per_candidate_invariant.py. Full suite 1337 passed; the lone test_scoring RSI failure is a pre-existing stale-local-config artifact (config clone 25 commits behind origin's L1695 #209 revert), unrelated to this change. Co-Authored-By: Claude Opus 4.7 (1M context) --- agents/investment_committee/ic_cio.py | 137 ++++++++++++++++++++-- tests/test_cio_per_candidate_invariant.py | 135 +++++++++++++++++++++ 2 files changed, 260 insertions(+), 12 deletions(-) diff --git a/agents/investment_committee/ic_cio.py b/agents/investment_committee/ic_cio.py index e1140573..4bc51cce 100644 --- a/agents/investment_committee/ic_cio.py +++ b/agents/investment_committee/ic_cio.py @@ -175,23 +175,53 @@ def run_cio( "CIO structured response had empty decisions list" ) return _fallback_selection(candidates, floor) - # Per-candidate invariant: every input candidate must receive a - # decision (ADVANCE / REJECT / NO_ADVANCE_DEADLOCK). Caught - # 2026-05-02 — PR B's strip of the inline JSON example in - # ic_cio_evaluation.txt let Sonnet emit a partial decisions - # list. Prompt fix (config #21) + schema min_length=1 close the - # empty-list edge; this assertion closes the partial-list edge. - if len(decisions_dicts) != len(candidates): + # Per-candidate invariant: every input candidate must appear + # exactly once in the decisions list (ADVANCE / REJECT / + # NO_ADVANCE_DEADLOCK). Reconcile the LLM's decisions against + # the candidate ticker SET rather than asserting a raw count. + # + # Why set reconciliation, not `len(decisions) == len(candidates)`: + # the count check (added 2026-05-02 for the partial-list edge — + # PR B stripped the inline JSON example and Sonnet emitted a + # SHORT list) is brittle in both directions. 2026-05-17 Saturday + # SF: Sonnet's structured-output batch returned 19 decisions for + # 18 candidates (one stray extra/duplicate decision object) — a + # benign LLM artifact the raw count check turned into a hard + # strict-mode failure of the entire weekly run. The count check + # is also too WEAK: 18 decisions for 18 candidates with one + # ticker duplicated (so one real candidate silently missing) + # passed it. Reconciling against the ticker set is strictly + # stronger — it self-heals extraneous/duplicate noise and still + # hard-fails the genuine partial-list regression the original + # assertion protected against. + decisions_dicts, recon = _reconcile_cio_decisions( + decisions_dicts, candidates, + ) + if recon["extraneous"] or recon["duplicate"]: + log.warning( + "[cio] reconciled CIO decisions vs candidate set: dropped " + "%d extraneous %s, collapsed %d duplicated ticker(s) %s " + "(conservative non-ADVANCE-wins); %d candidate decisions " + "retained", + len(recon["extraneous"]), recon["extraneous"], + len(recon["duplicate"]), recon["duplicate"], + len(decisions_dicts), + ) + if recon["missing"]: msg = ( - f"CIO returned {len(decisions_dicts)} decisions for " - f"{len(candidates)} candidates — every candidate must " - f"appear exactly once in the decisions list." + f"CIO returned {recon['raw_count']} decisions for " + f"{len(candidates)} candidates — {len(recon['missing'])} " + f"candidate(s) missing a decision after reconciliation " + f"(dropped {len(recon['extraneous'])} extraneous, " + f"{len(recon['duplicate'])} duplicate): " + f"missing={recon['missing']}. Every candidate must appear " + f"exactly once in the decisions list." ) log.warning("[cio] %s", msg) if is_strict_validation_enabled(): raise RuntimeError(msg) - # Lax mode: fall through to post-process, which tolerates a - # partial list by treating missing tickers as REJECT. + # Lax mode: fall through to post-process, which tolerates the + # still-missing tickers by treating them as REJECT. return _post_process_cio_decisions(decisions_dicts, candidates, floor, cap) except Exception as e: log.error("[cio] evaluation failed: %s", e) @@ -287,6 +317,89 @@ def _combined_score(c: dict) -> float: return (qs + qls) / 2 if qls else qs +def _reconcile_cio_decisions( + decisions: list[dict], candidates: list[dict], +) -> tuple[list[dict], dict]: + """Reconcile the LLM's raw decisions against the candidate ticker SET. + + Replaces the prior brittle ``len(decisions) == len(candidates)`` + assertion. Sonnet's structured-output batch occasionally emits a + stray extra decision object (2026-05-17 SF: 19 decisions for 18 + candidates) or a hallucinated ticker not in the candidate set; the + raw count check turned either benign artifact into a hard failure of + the whole weekly run, while *missing* a duplicate that left a real + candidate uncovered at an equal count. + + Reconciliation rules: + + * **Extraneous** — a decision whose ticker is not in the candidate + set is dropped (the CIO can only rule on what it was given). + * **Duplicate** — multiple decisions for the same candidate are + collapsed to one with *conservative-wins* precedence: a + non-ADVANCE decision beats an ADVANCE one, so a stray duplicate + can never *upgrade* a candidate into advancement. Ties keep the + first occurrence (the LLM's primary ordered judgment). + * **Missing** — a candidate with no surviving decision is reported; + the caller hard-fails in strict mode (this is the genuine + partial-list regression the original assertion guarded). + + The returned decisions are emitted in candidate order with each + decision's ``ticker`` normalised to the candidate's canonical + spelling, so ``_post_process_cio_decisions`` exact-ticker matching + stays deterministic even if the LLM altered casing/whitespace. + + Returns ``(reconciled_decisions, diagnostics)`` where diagnostics + has keys ``raw_count``, ``extraneous``, ``duplicate``, ``missing``. + """ + + def _norm(t) -> str: + return str(t or "").strip().upper() + + # Conservative-wins ranking: higher == more conservative (kept on a + # duplicate clash). Unknown/HOLD treated as mid (never upgrades). + _conservatism = {"ADVANCE": 0, "NO_ADVANCE_DEADLOCK": 1, "REJECT": 2} + + def _rank(dec: dict) -> int: + return _conservatism.get(str(dec.get("decision") or "").upper(), 1) + + canonical: dict[str, str] = {} + for c in candidates: + nt = _norm(c.get("ticker")) + if nt and nt not in canonical: + canonical[nt] = c.get("ticker") + + chosen: dict[str, dict] = {} + extraneous: list[str] = [] + duplicate: list[str] = [] + for d in decisions: + nt = _norm(d.get("ticker")) + if nt not in canonical: + extraneous.append(d.get("ticker")) + continue + d = dict(d) + d["ticker"] = canonical[nt] # normalise to canonical spelling + if nt not in chosen: + chosen[nt] = d + else: + if nt not in duplicate: + duplicate.append(nt) + # Keep the more conservative of the two; tie → keep first. + if _rank(d) > _rank(chosen[nt]): + chosen[nt] = d + + reconciled = [chosen[_norm(c.get("ticker"))] + for c in candidates if _norm(c.get("ticker")) in chosen] + missing = [c.get("ticker") for c in candidates + if _norm(c.get("ticker")) not in chosen] + + return reconciled, { + "raw_count": len(decisions), + "extraneous": extraneous, + "duplicate": duplicate, + "missing": missing, + } + + def _post_process_cio_decisions( decisions: list[dict], candidates: list[dict], diff --git a/tests/test_cio_per_candidate_invariant.py b/tests/test_cio_per_candidate_invariant.py index 90de7f48..e8d2f73d 100644 --- a/tests/test_cio_per_candidate_invariant.py +++ b/tests/test_cio_per_candidate_invariant.py @@ -181,3 +181,138 @@ def test_empty_decisions_handled_before_invariant(self, monkeypatch): # Empty case has its own message — distinct from the # per-candidate count message. Grep-friendly distinction. assert "empty decisions list" in str(exc_info.value).lower() + + +def _raw(pairs: list[tuple[str, str]]): + """Build a ``CIORawOutput`` from explicit (ticker, decision) pairs — + lets a test reproduce duplicate / extraneous / out-of-order shapes + the count-only helper can't express.""" + from graph.state_schemas import CIORawDecision, CIORawOutput + + return CIORawOutput(decisions=[ + CIORawDecision( + ticker=t, decision=d, rank=None, conviction=50, + rationale="synthetic", + ) + for t, d in pairs + ]) + + +def _run(monkeypatch, candidates, raw): + from agents.investment_committee.ic_cio import run_cio + + _patch_llm(monkeypatch, raw) + return run_cio( + candidates=candidates, + macro_context={"market_regime": "neutral"}, + sector_ratings={}, + current_population=[], + open_slots=2, + exits=[], + run_date="2026-05-17", + max_new_entrants=10, + min_new_entrants=2, + ) + + +class TestCIODecisionSetReconciliation: + """The 2026-05-17 Saturday SF failure class: Sonnet's structured + output returned 19 decisions for 18 candidates (one stray extra / + duplicate object). The old raw count check turned this benign LLM + artifact into a hard strict-mode failure of the whole weekly run. + Reconciling against the candidate ticker SET self-heals it while + staying strictly stronger than the count check.""" + + def test_exact_2026_05_17_shape_self_heals(self, monkeypatch): + """19 decisions for 18 candidates where the 19th is a duplicate + of an existing candidate → NO raise; reconciled to 18, one per + candidate; downstream post-processing runs.""" + candidates = [_candidate(f"X{i}") for i in range(18)] + pairs = [(f"X{i}", "REJECT") for i in range(18)] + pairs.append(("X0", "REJECT")) # the stray 19th + result = _run(monkeypatch, candidates, _raw(pairs)) + assert len(result["decisions"]) == 18 + assert {d["ticker"] for d in result["decisions"]} == { + f"X{i}" for i in range(18) + } + + def test_extraneous_hallucinated_ticker_dropped(self, monkeypatch): + """19 decisions for 18 candidates where the 19th is a ticker not + in the candidate set → dropped, no raise, all 18 covered.""" + candidates = [_candidate(f"X{i}") for i in range(18)] + pairs = [(f"X{i}", "REJECT") for i in range(18)] + pairs.append(("ZZZZ", "ADVANCE")) # hallucinated + result = _run(monkeypatch, candidates, _raw(pairs)) + assert "ZZZZ" not in {d["ticker"] for d in result["decisions"]} + assert len(result["decisions"]) == 18 + + def test_duplicate_conservative_wins_never_upgrades(self): + """A duplicate decision can never *upgrade* a candidate into + advancement: ADVANCE then REJECT for the same ticker collapses + to REJECT. Asserted directly on ``_reconcile_cio_decisions`` — + the reconciled decision, not the post-processed one, since + ``_post_process_cio_decisions`` floor-enforcement may legitimately + re-promote a REJECT to ADVANCE_FORCED (a separate concern).""" + from agents.investment_committee.ic_cio import ( + _reconcile_cio_decisions, + ) + + candidates = [_candidate(f"X{i}") for i in range(3)] + decisions = [ + {"ticker": "X0", "decision": "ADVANCE"}, + {"ticker": "X1", "decision": "REJECT"}, + {"ticker": "X2", "decision": "REJECT"}, + {"ticker": "X0", "decision": "REJECT"}, # dup — conservative + ] + reconciled, recon = _reconcile_cio_decisions(decisions, candidates) + assert recon["duplicate"] == ["X0"] + assert recon["missing"] == [] + assert recon["extraneous"] == [] + x0 = next(d for d in reconciled if d["ticker"] == "X0") + assert x0["decision"] == "REJECT" + # Order is candidate order, exactly one per candidate. + assert [d["ticker"] for d in reconciled] == ["X0", "X1", "X2"] + + def test_reconcile_first_wins_on_equal_conservatism(self): + """Two ADVANCE duplicates (equal conservatism) → first wins; + the duplicate is still recorded for the audit log.""" + from agents.investment_committee.ic_cio import ( + _reconcile_cio_decisions, + ) + + candidates = [_candidate("AAA")] + decisions = [ + {"ticker": "AAA", "decision": "ADVANCE", "rationale": "first"}, + {"ticker": "AAA", "decision": "ADVANCE", "rationale": "second"}, + ] + reconciled, recon = _reconcile_cio_decisions(decisions, candidates) + assert len(reconciled) == 1 + assert reconciled[0]["rationale"] == "first" + assert recon["duplicate"] == ["AAA"] + + def test_casing_whitespace_normalised_not_dropped(self, monkeypatch): + """LLM altering ticker casing/whitespace must not orphan a + candidate — normalised match, canonical spelling restored.""" + candidates = [_candidate("AAPL"), _candidate("MSFT")] + result = _run(monkeypatch, candidates, + _raw([(" aapl ", "REJECT"), ("msft", "REJECT")])) + assert {d["ticker"] for d in result["decisions"]} == {"AAPL", "MSFT"} + + def test_count_equal_but_candidate_missing_now_raises(self, monkeypatch): + """Strictly STRONGER than the old check: 18 decisions for 18 + candidates but X0 is duplicated and X17 absent. The old + ``len == len`` check PASSED this (real candidate silently + dropped); set reconciliation correctly hard-fails in strict + mode, naming the missing ticker. Message keeps the + ``N decisions``/``M candidates`` substrings for log/grep + continuity.""" + candidates = [_candidate(f"X{i}") for i in range(18)] + pairs = [(f"X{i}", "REJECT") for i in range(17)] # X0..X16 + pairs.append(("X0", "REJECT")) # dup → count is 18, X17 missing + with pytest.raises(RuntimeError) as exc_info: + _run(monkeypatch, candidates, _raw(pairs)) + msg = str(exc_info.value) + assert "18 decisions" in msg + assert "18 candidates" in msg + assert "missing" in msg.lower() + assert "X17" in msg