From ba6be83b80b1589b1484e92d024314a88d518563 Mon Sep 17 00:00:00 2001
From: Brian McMahon <brian@nousergon.ai>
Date: Sun, 17 May 2026 14:21:44 -0700
Subject: [PATCH] fix(cio): reconcile CIO decisions vs candidate set, not raw
 count

The 2026-05-17 Saturday SF research run hard-failed in strict mode:
Sonnet's structured-output batch returned 19 decisions for 18
candidates (one stray extra/duplicate decision object). The
`len(decisions) != len(candidates)` assertion in run_cio (added
2026-05-02 for the partial-list edge) turned this benign LLM artifact
into a failure of the entire weekly pipeline.

Replace the raw count check with `_reconcile_cio_decisions`, which
validates against the candidate ticker SET:

  * extraneous (ticker not in candidate set) -> dropped + logged
  * duplicate -> collapsed conservative-wins (a duplicate can never
    upgrade a candidate into advancement; ties keep first occurrence)
  * decisions emitted in candidate order, ticker normalised to the
    candidate's canonical spelling so _post_process_cio_decisions
    exact-match stays deterministic across LLM casing/whitespace drift
  * genuine MISSING candidate after reconciliation -> strict-mode
    raise (preserves the 2026-05-02 partial-list protection)

More precise than the old check in both directions: it tolerates the
benign extra/duplicate objects described above, and it catches what the
count check let through. 18-for-18 with one ticker duplicated (a real
candidate silently dropped) PASSED the count check; set reconciliation
correctly hard-fails it. Strict-mode message keeps the "N decisions for
M candidates" substrings for log/grep continuity.

Tests: +6 reconciliation cases (exact 2026-05-17 shape, hallucinated
ticker, conservative-wins, casing/whitespace, count-equal-but-missing,
first-wins tie) extending test_cio_per_candidate_invariant.py. Full
suite 1337 passed; the lone test_scoring RSI failure is a pre-existing
stale-local-config artifact (config clone 25 commits behind origin's
L1695 #209 revert), unrelated to this change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 agents/investment_committee/ic_cio.py     | 137 ++++++++++++++++++++--
 tests/test_cio_per_candidate_invariant.py | 135 +++++++++++++++++++++
 2 files changed, 260 insertions(+), 12 deletions(-)

diff --git a/agents/investment_committee/ic_cio.py b/agents/investment_committee/ic_cio.py
index e1140573..4bc51cce 100644
--- a/agents/investment_committee/ic_cio.py
+++ b/agents/investment_committee/ic_cio.py
@@ -175,23 +175,53 @@ def run_cio(
                     "CIO structured response had empty decisions list"
                 )
             return _fallback_selection(candidates, floor)
-        # Per-candidate invariant: every input candidate must receive a
-        # decision (ADVANCE / REJECT / NO_ADVANCE_DEADLOCK). Caught
-        # 2026-05-02 — PR B's strip of the inline JSON example in
-        # ic_cio_evaluation.txt let Sonnet emit a partial decisions
-        # list. Prompt fix (config #21) + schema min_length=1 close the
-        # empty-list edge; this assertion closes the partial-list edge.
-        if len(decisions_dicts) != len(candidates):
+        # Per-candidate invariant: every input candidate must appear
+        # exactly once in the decisions list (ADVANCE / REJECT /
+        # NO_ADVANCE_DEADLOCK). Reconcile the LLM's decisions against
+        # the candidate ticker SET rather than asserting a raw count.
+        #
+        # Why set reconciliation, not `len(decisions) == len(candidates)`:
+        # the count check (added 2026-05-02 for the partial-list edge —
+        # PR B stripped the inline JSON example and Sonnet emitted a
+        # SHORT list) is brittle in both directions. 2026-05-17 Saturday
+        # SF: Sonnet's structured-output batch returned 19 decisions for
+        # 18 candidates (one stray extra/duplicate decision object) — a
+        # benign LLM artifact the raw count check turned into a hard
+        # strict-mode failure of the entire weekly run. The count check
+        # is also too WEAK: 18 decisions for 18 candidates with one
+        # ticker duplicated (so one real candidate silently missing)
+        # passed it. Reconciling against the ticker set fixes both: it
+        # self-heals extraneous/duplicate noise and still hard-fails the
+        # genuine partial-list regression the original assertion
+        # protected against.
+        decisions_dicts, recon = _reconcile_cio_decisions(
+            decisions_dicts, candidates,
+        )
+        if recon["extraneous"] or recon["duplicate"]:
+            log.warning(
+                "[cio] reconciled CIO decisions vs candidate set: dropped "
+                "%d extraneous %s, collapsed %d duplicated ticker(s) %s "
+                "(conservative non-ADVANCE-wins); %d candidate decisions "
+                "retained",
+                len(recon["extraneous"]), recon["extraneous"],
+                len(recon["duplicate"]), recon["duplicate"],
+                len(decisions_dicts),
+            )
+        if recon["missing"]:
             msg = (
-                f"CIO returned {len(decisions_dicts)} decisions for "
-                f"{len(candidates)} candidates — every candidate must "
-                f"appear exactly once in the decisions list."
+                f"CIO returned {recon['raw_count']} decisions for "
+                f"{len(candidates)} candidates — {len(recon['missing'])} "
+                f"candidate(s) missing a decision after reconciliation "
+                f"(dropped {len(recon['extraneous'])} extraneous, "
+                f"{len(recon['duplicate'])} duplicate): "
+                f"missing={recon['missing']}. Every candidate must appear "
+                f"exactly once in the decisions list."
             )
             log.warning("[cio] %s", msg)
             if is_strict_validation_enabled():
                 raise RuntimeError(msg)
-            # Lax mode: fall through to post-process, which tolerates a
-            # partial list by treating missing tickers as REJECT.
+            # Lax mode: fall through to post-process, which tolerates the
+            # still-missing tickers by treating them as REJECT.
         return _post_process_cio_decisions(decisions_dicts, candidates, floor, cap)
     except Exception as e:
         log.error("[cio] evaluation failed: %s", e)
@@ -287,6 +317,89 @@ def _combined_score(c: dict) -> float:
     return (qs + qls) / 2 if qls else qs
 
 
+def _reconcile_cio_decisions(
+    decisions: list[dict], candidates: list[dict],
+) -> tuple[list[dict], dict]:
+    """Reconcile the LLM's raw decisions against the candidate ticker SET.
+
+    Replaces the prior brittle ``len(decisions) == len(candidates)``
+    assertion. Sonnet's structured-output batch occasionally emits a
+    stray extra decision object (2026-05-17 SF: 19 decisions for 18
+    candidates) or a hallucinated ticker not in the candidate set; the
+    raw count check turned either benign artifact into a hard failure of
+    the whole weekly run, yet failed to catch a duplicate that left a
+    real candidate uncovered when the counts happened to match.
+
+    Reconciliation rules:
+
+    * **Extraneous** — a decision whose ticker is not in the candidate
+      set is dropped (the CIO can only rule on what it was given).
+    * **Duplicate** — multiple decisions for the same candidate are
+      collapsed to one with *conservative-wins* precedence: a
+      non-ADVANCE decision beats an ADVANCE one, so a stray duplicate
+      can never *upgrade* a candidate into advancement. Ties keep the
+      first occurrence (the LLM's primary ordered judgment).
+    * **Missing** — a candidate with no surviving decision is reported;
+      the caller hard-fails in strict mode (this is the genuine
+      partial-list regression the original assertion guarded).
+
+    The returned decisions are emitted in candidate order with each
+    decision's ``ticker`` normalised to the candidate's canonical
+    spelling, so ``_post_process_cio_decisions`` exact-ticker matching
+    stays deterministic even if the LLM altered casing/whitespace.
+
+    Returns ``(reconciled_decisions, diagnostics)`` where diagnostics
+    has keys ``raw_count``, ``extraneous``, ``duplicate``, ``missing``.
+    """
+
+    def _norm(t) -> str:
+        return str(t or "").strip().upper()
+
+    # Conservative-wins ranking: higher == more conservative (kept on a
+    # duplicate clash). Unknown/HOLD treated as mid (never upgrades).
+    _conservatism = {"ADVANCE": 0, "NO_ADVANCE_DEADLOCK": 1, "REJECT": 2}
+
+    def _rank(dec: dict) -> int:
+        return _conservatism.get(str(dec.get("decision") or "").upper(), 1)
+
+    canonical: dict[str, str] = {}
+    for c in candidates:
+        nt = _norm(c.get("ticker"))
+        if nt and nt not in canonical:
+            canonical[nt] = c.get("ticker")
+
+    chosen: dict[str, dict] = {}
+    extraneous: list[str] = []
+    duplicate: list[str] = []
+    for d in decisions:
+        nt = _norm(d.get("ticker"))
+        if nt not in canonical:
+            extraneous.append(d.get("ticker"))
+            continue
+        d = dict(d)
+        d["ticker"] = canonical[nt]  # normalise to canonical spelling
+        if nt not in chosen:
+            chosen[nt] = d
+        else:
+            if nt not in duplicate:
+                duplicate.append(nt)
+            # Keep the more conservative of the two; tie → keep first.
+            if _rank(d) > _rank(chosen[nt]):
+                chosen[nt] = d
+
+    reconciled = [chosen[_norm(c.get("ticker"))]
+                  for c in candidates if _norm(c.get("ticker")) in chosen]
+    missing = [c.get("ticker") for c in candidates
+               if _norm(c.get("ticker")) not in chosen]
+
+    return reconciled, {
+        "raw_count": len(decisions),
+        "extraneous": extraneous,
+        "duplicate": duplicate,
+        "missing": missing,
+    }
+
+
 def _post_process_cio_decisions(
     decisions: list[dict],
     candidates: list[dict],
diff --git a/tests/test_cio_per_candidate_invariant.py b/tests/test_cio_per_candidate_invariant.py
index 90de7f48..e8d2f73d 100644
--- a/tests/test_cio_per_candidate_invariant.py
+++ b/tests/test_cio_per_candidate_invariant.py
@@ -181,3 +181,138 @@ def test_empty_decisions_handled_before_invariant(self, monkeypatch):
         # Empty case has its own message — distinct from the
         # per-candidate count message. Grep-friendly distinction.
         assert "empty decisions list" in str(exc_info.value).lower()
+
+
+def _raw(pairs: list[tuple[str, str]]):
+    """Build a ``CIORawOutput`` from explicit (ticker, decision) pairs —
+    lets a test reproduce duplicate / extraneous / out-of-order shapes
+    the count-only helper can't express."""
+    from graph.state_schemas import CIORawDecision, CIORawOutput
+
+    return CIORawOutput(decisions=[
+        CIORawDecision(
+            ticker=t, decision=d, rank=None, conviction=50,
+            rationale="synthetic",
+        )
+        for t, d in pairs
+    ])
+
+
+def _run(monkeypatch, candidates, raw):
+    from agents.investment_committee.ic_cio import run_cio
+
+    _patch_llm(monkeypatch, raw)
+    return run_cio(
+        candidates=candidates,
+        macro_context={"market_regime": "neutral"},
+        sector_ratings={},
+        current_population=[],
+        open_slots=2,
+        exits=[],
+        run_date="2026-05-17",
+        max_new_entrants=10,
+        min_new_entrants=2,
+    )
+
+
+class TestCIODecisionSetReconciliation:
+    """The 2026-05-17 Saturday SF failure class: Sonnet's structured
+    output returned 19 decisions for 18 candidates (one stray extra /
+    duplicate object). The old raw count check turned this benign LLM
+    artifact into a hard strict-mode failure of the whole weekly run.
+    Reconciling against the candidate ticker SET self-heals it while
+    staying strictly stronger than the count check."""
+
+    def test_exact_2026_05_17_shape_self_heals(self, monkeypatch):
+        """19 decisions for 18 candidates where the 19th is a duplicate
+        of an existing candidate → NO raise; reconciled to 18, one per
+        candidate; downstream post-processing runs."""
+        candidates = [_candidate(f"X{i}") for i in range(18)]
+        pairs = [(f"X{i}", "REJECT") for i in range(18)]
+        pairs.append(("X0", "REJECT"))  # the stray 19th
+        result = _run(monkeypatch, candidates, _raw(pairs))
+        assert len(result["decisions"]) == 18
+        assert {d["ticker"] for d in result["decisions"]} == {
+            f"X{i}" for i in range(18)
+        }
+
+    def test_extraneous_hallucinated_ticker_dropped(self, monkeypatch):
+        """19 decisions for 18 candidates where the 19th is a ticker not
+        in the candidate set → dropped, no raise, all 18 covered."""
+        candidates = [_candidate(f"X{i}") for i in range(18)]
+        pairs = [(f"X{i}", "REJECT") for i in range(18)]
+        pairs.append(("ZZZZ", "ADVANCE"))  # hallucinated
+        result = _run(monkeypatch, candidates, _raw(pairs))
+        assert "ZZZZ" not in {d["ticker"] for d in result["decisions"]}
+        assert len(result["decisions"]) == 18
+
+    def test_duplicate_conservative_wins_never_upgrades(self):
+        """A duplicate decision can never *upgrade* a candidate into
+        advancement: ADVANCE then REJECT for the same ticker collapses
+        to REJECT. Asserted directly on ``_reconcile_cio_decisions`` —
+        the reconciled decision, not the post-processed one, since
+        ``_post_process_cio_decisions`` floor-enforcement may legitimately
+        re-promote a REJECT to ADVANCE_FORCED (a separate concern)."""
+        from agents.investment_committee.ic_cio import (
+            _reconcile_cio_decisions,
+        )
+
+        candidates = [_candidate(f"X{i}") for i in range(3)]
+        decisions = [
+            {"ticker": "X0", "decision": "ADVANCE"},
+            {"ticker": "X1", "decision": "REJECT"},
+            {"ticker": "X2", "decision": "REJECT"},
+            {"ticker": "X0", "decision": "REJECT"},  # dup — conservative
+        ]
+        reconciled, recon = _reconcile_cio_decisions(decisions, candidates)
+        assert recon["duplicate"] == ["X0"]
+        assert recon["missing"] == []
+        assert recon["extraneous"] == []
+        x0 = next(d for d in reconciled if d["ticker"] == "X0")
+        assert x0["decision"] == "REJECT"
+        # Order is candidate order, exactly one per candidate.
+        assert [d["ticker"] for d in reconciled] == ["X0", "X1", "X2"]
+
+    def test_reconcile_first_wins_on_equal_conservatism(self):
+        """Two ADVANCE duplicates (equal conservatism) → first wins;
+        the duplicate is still recorded for the audit log."""
+        from agents.investment_committee.ic_cio import (
+            _reconcile_cio_decisions,
+        )
+
+        candidates = [_candidate("AAA")]
+        decisions = [
+            {"ticker": "AAA", "decision": "ADVANCE", "rationale": "first"},
+            {"ticker": "AAA", "decision": "ADVANCE", "rationale": "second"},
+        ]
+        reconciled, recon = _reconcile_cio_decisions(decisions, candidates)
+        assert len(reconciled) == 1
+        assert reconciled[0]["rationale"] == "first"
+        assert recon["duplicate"] == ["AAA"]
+
+    def test_casing_whitespace_normalised_not_dropped(self, monkeypatch):
+        """LLM altering ticker casing/whitespace must not orphan a
+        candidate — normalised match, canonical spelling restored."""
+        candidates = [_candidate("AAPL"), _candidate("MSFT")]
+        result = _run(monkeypatch, candidates,
+                      _raw([(" aapl ", "REJECT"), ("msft", "REJECT")]))
+        assert {d["ticker"] for d in result["decisions"]} == {"AAPL", "MSFT"}
+
+    def test_count_equal_but_candidate_missing_now_raises(self, monkeypatch):
+        """Strictly STRONGER than the old check: 18 decisions for 18
+        candidates but X0 is duplicated and X17 absent. The old
+        ``len == len`` check PASSED this (real candidate silently
+        dropped); set reconciliation correctly hard-fails in strict
+        mode, naming the missing ticker. Message keeps the
+        ``N decisions``/``M candidates`` substrings for log/grep
+        continuity."""
+        candidates = [_candidate(f"X{i}") for i in range(18)]
+        pairs = [(f"X{i}", "REJECT") for i in range(17)]  # X0..X16
+        pairs.append(("X0", "REJECT"))  # dup → count is 18, X17 missing
+        with pytest.raises(RuntimeError) as exc_info:
+            _run(monkeypatch, candidates, _raw(pairs))
+        msg = str(exc_info.value)
+        assert "18 decisions" in msg
+        assert "18 candidates" in msg
+        assert "missing" in msg.lower()
+        assert "X17" in msg