Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 125 additions & 12 deletions agents/investment_committee/ic_cio.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,23 +175,53 @@ def run_cio(
"CIO structured response had empty decisions list"
)
return _fallback_selection(candidates, floor)
# Per-candidate invariant: every input candidate must receive a
# decision (ADVANCE / REJECT / NO_ADVANCE_DEADLOCK). Caught
# 2026-05-02 — PR B's strip of the inline JSON example in
# ic_cio_evaluation.txt let Sonnet emit a partial decisions
# list. Prompt fix (config #21) + schema min_length=1 close the
# empty-list edge; this assertion closes the partial-list edge.
if len(decisions_dicts) != len(candidates):
# Per-candidate invariant: every input candidate must appear
# exactly once in the decisions list (ADVANCE / REJECT /
# NO_ADVANCE_DEADLOCK). Reconcile the LLM's decisions against
# the candidate ticker SET rather than asserting a raw count.
#
# Why set reconciliation, not `len(decisions) == len(candidates)`:
# the count check (added 2026-05-02 for the partial-list edge —
# PR B stripped the inline JSON example and Sonnet emitted a
# SHORT list) is brittle in both directions. 2026-05-17 Saturday
# SF: Sonnet's structured-output batch returned 19 decisions for
# 18 candidates (one stray extra/duplicate decision object) — a
# benign LLM artifact the raw count check turned into a hard
# strict-mode failure of the entire weekly run. The count check
# is also too WEAK: 18 decisions for 18 candidates with one
# ticker duplicated (so one real candidate silently missing)
# passed it. Reconciling against the ticker set is strictly
# stronger — it self-heals extraneous/duplicate noise and still
# hard-fails the genuine partial-list regression the original
# assertion protected against.
decisions_dicts, recon = _reconcile_cio_decisions(
decisions_dicts, candidates,
)
if recon["extraneous"] or recon["duplicate"]:
log.warning(
"[cio] reconciled CIO decisions vs candidate set: dropped "
"%d extraneous %s, collapsed %d duplicated ticker(s) %s "
"(conservative non-ADVANCE-wins); %d candidate decisions "
"retained",
len(recon["extraneous"]), recon["extraneous"],
len(recon["duplicate"]), recon["duplicate"],
len(decisions_dicts),
)
if recon["missing"]:
msg = (
f"CIO returned {len(decisions_dicts)} decisions for "
f"{len(candidates)} candidates — every candidate must "
f"appear exactly once in the decisions list."
f"CIO returned {recon['raw_count']} decisions for "
f"{len(candidates)} candidates — {len(recon['missing'])} "
f"candidate(s) missing a decision after reconciliation "
f"(dropped {len(recon['extraneous'])} extraneous, "
f"{len(recon['duplicate'])} duplicate): "
f"missing={recon['missing']}. Every candidate must appear "
f"exactly once in the decisions list."
)
log.warning("[cio] %s", msg)
if is_strict_validation_enabled():
raise RuntimeError(msg)
# Lax mode: fall through to post-process, which tolerates a
# partial list by treating missing tickers as REJECT.
# Lax mode: fall through to post-process, which tolerates the
# still-missing tickers by treating them as REJECT.
return _post_process_cio_decisions(decisions_dicts, candidates, floor, cap)
except Exception as e:
log.error("[cio] evaluation failed: %s", e)
Expand Down Expand Up @@ -287,6 +317,89 @@ def _combined_score(c: dict) -> float:
return (qs + qls) / 2 if qls else qs


def _reconcile_cio_decisions(
decisions: list[dict], candidates: list[dict],
) -> tuple[list[dict], dict]:
"""Reconcile the LLM's raw decisions against the candidate ticker SET.

Replaces the prior brittle ``len(decisions) == len(candidates)``
assertion. Sonnet's structured-output batch occasionally emits a
stray extra decision object (2026-05-17 SF: 19 decisions for 18
candidates) or a hallucinated ticker not in the candidate set; the
raw count check turned either benign artifact into a hard failure of
the whole weekly run, while *missing* a duplicate that left a real
candidate uncovered at an equal count.

Reconciliation rules:

* **Extraneous** — a decision whose ticker is not in the candidate
set is dropped (the CIO can only rule on what it was given).
* **Duplicate** — multiple decisions for the same candidate are
collapsed to one with *conservative-wins* precedence: a
non-ADVANCE decision beats an ADVANCE one, so a stray duplicate
can never *upgrade* a candidate into advancement. Ties keep the
first occurrence (the LLM's primary ordered judgment).
* **Missing** — a candidate with no surviving decision is reported;
the caller hard-fails in strict mode (this is the genuine
partial-list regression the original assertion guarded).

The returned decisions are emitted in candidate order with each
decision's ``ticker`` normalised to the candidate's canonical
spelling, so ``_post_process_cio_decisions`` exact-ticker matching
stays deterministic even if the LLM altered casing/whitespace.

Returns ``(reconciled_decisions, diagnostics)`` where diagnostics
has keys ``raw_count``, ``extraneous``, ``duplicate``, ``missing``.
"""

def _norm(t) -> str:
return str(t or "").strip().upper()

# Conservative-wins ranking: higher == more conservative (kept on a
# duplicate clash). Unknown/HOLD treated as mid (never upgrades).
_conservatism = {"ADVANCE": 0, "NO_ADVANCE_DEADLOCK": 1, "REJECT": 2}

def _rank(dec: dict) -> int:
return _conservatism.get(str(dec.get("decision") or "").upper(), 1)

canonical: dict[str, str] = {}
for c in candidates:
nt = _norm(c.get("ticker"))
if nt and nt not in canonical:
canonical[nt] = c.get("ticker")

chosen: dict[str, dict] = {}
extraneous: list[str] = []
duplicate: list[str] = []
for d in decisions:
nt = _norm(d.get("ticker"))
if nt not in canonical:
extraneous.append(d.get("ticker"))
continue
d = dict(d)
d["ticker"] = canonical[nt] # normalise to canonical spelling
if nt not in chosen:
chosen[nt] = d
else:
if nt not in duplicate:
duplicate.append(nt)
# Keep the more conservative of the two; tie → keep first.
if _rank(d) > _rank(chosen[nt]):
chosen[nt] = d

reconciled = [chosen[_norm(c.get("ticker"))]
for c in candidates if _norm(c.get("ticker")) in chosen]
missing = [c.get("ticker") for c in candidates
if _norm(c.get("ticker")) not in chosen]

return reconciled, {
"raw_count": len(decisions),
"extraneous": extraneous,
"duplicate": duplicate,
"missing": missing,
}


def _post_process_cio_decisions(
decisions: list[dict],
candidates: list[dict],
Expand Down
135 changes: 135 additions & 0 deletions tests/test_cio_per_candidate_invariant.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,3 +181,138 @@ def test_empty_decisions_handled_before_invariant(self, monkeypatch):
# Empty case has its own message — distinct from the
# per-candidate count message. Grep-friendly distinction.
assert "empty decisions list" in str(exc_info.value).lower()


def _raw(pairs: list[tuple[str, str]]):
    """Wrap explicit ``(ticker, decision)`` pairs in a ``CIORawOutput``.

    Taking the pairs verbatim lets a test spell out duplicate,
    extraneous or out-of-order decision lists — shapes a helper that
    only takes a count cannot express.
    """
    from graph.state_schemas import CIORawDecision, CIORawOutput

    synthetic_decisions = []
    for ticker, verdict in pairs:
        # Every field except ticker/decision is a fixed placeholder.
        synthetic_decisions.append(
            CIORawDecision(
                ticker=ticker,
                decision=verdict,
                rank=None,
                conviction=50,
                rationale="synthetic",
            )
        )
    return CIORawOutput(decisions=synthetic_decisions)


def _run(monkeypatch, candidates, raw):
    """Invoke ``run_cio`` on ``candidates`` with the LLM patched to
    return ``raw``; every other argument is a fixed neutral default."""
    from agents.investment_committee.ic_cio import run_cio

    _patch_llm(monkeypatch, raw)

    # Fixed scenario shared by all reconciliation tests.
    scenario = {
        "macro_context": {"market_regime": "neutral"},
        "sector_ratings": {},
        "current_population": [],
        "open_slots": 2,
        "exits": [],
        "run_date": "2026-05-17",
        "max_new_entrants": 10,
        "min_new_entrants": 2,
    }
    return run_cio(candidates=candidates, **scenario)


class TestCIODecisionSetReconciliation:
    """Regression suite for the 2026-05-17 Saturday SF failure class.

    Sonnet's structured output produced 19 decisions for 18 candidates
    (a single stray extra / duplicate object) and the old raw count
    check escalated that harmless artifact into a strict-mode failure
    of the entire weekly run. These tests pin that reconciling against
    the candidate ticker SET heals such noise while remaining strictly
    stronger than the count check.
    """

    def test_exact_2026_05_17_shape_self_heals(self, monkeypatch):
        """The 19th decision repeats an existing candidate: no raise,
        output reconciled to one decision per candidate (18), and
        downstream post-processing still runs."""
        tickers = [f"X{i}" for i in range(18)]
        candidates = [_candidate(t) for t in tickers]
        pairs = [(t, "REJECT") for t in tickers]
        pairs += [("X0", "REJECT")]  # the stray 19th
        result = _run(monkeypatch, candidates, _raw(pairs))
        assert len(result["decisions"]) == 18
        decided = {d["ticker"] for d in result["decisions"]}
        assert decided == set(tickers)

    def test_extraneous_hallucinated_ticker_dropped(self, monkeypatch):
        """The 19th decision names a ticker outside the candidate set:
        it is discarded, nothing raises, and all 18 are covered."""
        tickers = [f"X{i}" for i in range(18)]
        candidates = [_candidate(t) for t in tickers]
        pairs = [(t, "REJECT") for t in tickers]
        pairs += [("ZZZZ", "ADVANCE")]  # hallucinated
        result = _run(monkeypatch, candidates, _raw(pairs))
        decided = {d["ticker"] for d in result["decisions"]}
        assert "ZZZZ" not in decided
        assert len(result["decisions"]) == 18

    def test_duplicate_conservative_wins_never_upgrades(self):
        """ADVANCE followed by REJECT for one ticker collapses to
        REJECT — a duplicate must never *upgrade* a candidate.

        Checked on ``_reconcile_cio_decisions`` directly rather than on
        the post-processed output, because floor enforcement in
        ``_post_process_cio_decisions`` may legitimately re-promote a
        REJECT to ADVANCE_FORCED (a separate concern)."""
        from agents.investment_committee.ic_cio import (
            _reconcile_cio_decisions,
        )

        candidates = [_candidate(name) for name in ("X0", "X1", "X2")]
        decisions = [
            {"ticker": "X0", "decision": "ADVANCE"},
            {"ticker": "X1", "decision": "REJECT"},
            {"ticker": "X2", "decision": "REJECT"},
            {"ticker": "X0", "decision": "REJECT"},  # dup — conservative
        ]
        reconciled, recon = _reconcile_cio_decisions(decisions, candidates)
        assert recon["duplicate"] == ["X0"]
        assert recon["missing"] == []
        assert recon["extraneous"] == []
        x0_matches = [d for d in reconciled if d["ticker"] == "X0"]
        assert x0_matches[0]["decision"] == "REJECT"
        # Candidate order, exactly one decision per candidate.
        ordered_tickers = [d["ticker"] for d in reconciled]
        assert ordered_tickers == ["X0", "X1", "X2"]

    def test_reconcile_first_wins_on_equal_conservatism(self):
        """With two equally conservative ADVANCE duplicates the first
        is kept, and the clash is still recorded for the audit log."""
        from agents.investment_committee.ic_cio import (
            _reconcile_cio_decisions,
        )

        decisions = [
            {"ticker": "AAA", "decision": "ADVANCE", "rationale": label}
            for label in ("first", "second")
        ]
        reconciled, recon = _reconcile_cio_decisions(
            decisions, [_candidate("AAA")],
        )
        assert len(reconciled) == 1
        assert reconciled[0]["rationale"] == "first"
        assert recon["duplicate"] == ["AAA"]

    def test_casing_whitespace_normalised_not_dropped(self, monkeypatch):
        """A ticker the LLM re-cased or padded with whitespace must
        still match its candidate and come back in canonical form."""
        candidates = [_candidate("AAPL"), _candidate("MSFT")]
        raw = _raw([(" aapl ", "REJECT"), ("msft", "REJECT")])
        result = _run(monkeypatch, candidates, raw)
        decided = {d["ticker"] for d in result["decisions"]}
        assert decided == {"AAPL", "MSFT"}

    def test_count_equal_but_candidate_missing_now_raises(self, monkeypatch):
        """Strictly STRONGER than the old check.

        18 decisions arrive for 18 candidates, but X0 appears twice and
        X17 not at all. The former ``len == len`` check let this through
        (a real candidate silently dropped); set reconciliation
        hard-fails in strict mode and names the missing ticker. The
        message retains the ``N decisions``/``M candidates`` substrings
        for log/grep continuity."""
        candidates = [_candidate(f"X{i}") for i in range(18)]
        pairs = [(f"X{i}", "REJECT") for i in range(17)]  # X0..X16
        pairs += [("X0", "REJECT")]  # dup → count is 18, X17 missing
        with pytest.raises(RuntimeError) as exc_info:
            _run(monkeypatch, candidates, _raw(pairs))
        msg = str(exc_info.value)
        for fragment in ("18 decisions", "18 candidates"):
            assert fragment in msg
        assert "missing" in msg.lower()
        assert "X17" in msg
Loading