diff --git a/executor/eod_reconcile.py b/executor/eod_reconcile.py
index 45b0844..fae1ff8 100644
--- a/executor/eod_reconcile.py
+++ b/executor/eod_reconcile.py
@@ -18,6 +18,7 @@
 import boto3
 import pandas as pd
 import yaml
+from pydantic import BaseModel, Field, ValidationError
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
 from executor.eod_emailer import send_eod_email
@@ -257,8 +258,54 @@ def _build_position_contexts(
     return contexts, data_warnings
 
 
+class _Narrative(BaseModel):
+    """One per-position rationale."""
+
+    ticker: str = Field(..., description="Position ticker symbol (e.g. AAPL).")
+    narrative: str = Field(
+        ...,
+        description=(
+            "2-3 sentences explaining why this position is held today, citing the "
+            "research thesis, technical signals, and GBM predictions where relevant. "
+            "If a trade was made today, the narrative also explains why."
+        ),
+    )
+
+
+class _RationalesResponse(BaseModel):
+    """Tool-use payload for the EOD rationale synthesis call. Forced tool-use
+    steers Haiku toward this shape, but ``tool_use.input`` arrives as a plain
+    dict that is not guaranteed to match the schema, so Pydantic validation
+    here is the actual enforcement point. Replaces the legacy "ask for JSON in
+    the prompt and json.loads the text" pattern that L1248 / L2669 documented
+    as recurrence-prone (markdown fences, preamble, trailing text)."""
+
+    narratives: list[_Narrative] = Field(
+        ...,
+        description="One narrative per position in the input list.",
+    )
+
+
+_RATIONALES_TOOL = {
+    "name": "emit_rationales",
+    "description": (
+        "Emit per-position rationales for the EOD report. Call this tool exactly "
+        "once with the full list — one narrative per input position."
+    ),
+    "input_schema": _RationalesResponse.model_json_schema(),
+}
+
+
 def _synthesize_rationales(contexts: list[dict]) -> dict[str, str]:
-    """Call Haiku to synthesize per-position narratives. Falls back to templates."""
+    """Call Haiku via Anthropic tool-use + Pydantic validation to synthesize
+    per-position narratives. Falls back to templates on any failure.
+
+    L1248 / L2669: previous implementation read Haiku's freeform text and
+    tried to ``json.loads`` it — recurrence-prone (markdown fences /
+    preamble / trailing text). Forced tool-use removes that text-parsing
+    step: Haiku returns a ``tool_use`` block whose ``input`` is already a
+    dict. Its shape is still model output, so Pydantic validates it below.
+    """
     if not contexts:
         return {}
 
@@ -272,17 +319,38 @@ def _synthesize_rationales(contexts: list[dict]) -> dict[str, str]:
             "For each position below, write 2-3 sentences explaining why it is held, "
             "focusing on near-term catalysts (research thesis, technical signals, GBM predictions). "
             "If a trade was made today, explain why. Be specific about numbers.\n\n"
-            "Return valid JSON only: {\"narratives\": [{\"ticker\": \"XXX\", \"narrative\": \"...\"}]}\n\n"
+            "Call the emit_rationales tool exactly once with one narrative per position.\n\n"
             f"Positions:\n{json.dumps(contexts, indent=2, default=str)}"
         )
 
         response = client.messages.create(
             model="claude-haiku-4-5-20251001",
             max_tokens=2000,
+            tools=[_RATIONALES_TOOL],
+            tool_choice={"type": "tool", "name": "emit_rationales"},
             messages=[{"role": "user", "content": prompt}],
         )
-        result = json.loads(response.content[0].text)
-        return {n["ticker"]: n["narrative"] for n in result.get("narratives", [])}
+        # tool_choice={"type": "tool", "name": ...} forces Haiku to call the
+        # tool, but do not assume the block is content[0]: select the
+        # tool_use block by type so stray text blocks cannot break parsing.
+        tool_use = next(
+            (b for b in response.content if getattr(b, "type", None) == "tool_use"),
+            None,
+        )
+        if tool_use is None:
+            raise RuntimeError(
+                "Haiku response missing the forced emit_rationales tool_use block — "
+                f"stop_reason={response.stop_reason!r}"
+            )
+        try:
+            parsed = _RationalesResponse.model_validate(tool_use.input)
+        except ValidationError as e:
+            logger.warning(
+                f"LLM rationale tool_use failed Pydantic validation: {e} — "
+                f"input={tool_use.input!r}"
+            )
+            raise
+        return {n.ticker: n.narrative for n in parsed.narratives}
     except Exception as e:
         logger.warning(f"LLM rationale synthesis failed: {e} — using template fallback")
 
diff --git a/tests/test_eod_reconcile_logic.py b/tests/test_eod_reconcile_logic.py
index 5bbdff0..657889c 100644
--- a/tests/test_eod_reconcile_logic.py
+++ b/tests/test_eod_reconcile_logic.py
@@ -10,6 +10,8 @@
     _apply_dividend_delta,
     _compute_unattributed_residual_pct,
     _load_constituents_sector_map,
+    _Narrative,
+    _RationalesResponse,
     _resolve_prior_price,
     _synthesize_rationales,
 )
@@ -193,6 +195,141 @@ def test_multiple_tickers(self):
         assert "MSFT" in result
 
 
+class TestRationalesResponsePydantic:
+    """L1248/L2669: Pydantic model that validates the tool-use payload
+    returned by Haiku. Tool-use removes the text-parsing failures that
+    bare json.loads used to hit (markdown fences, preamble, trailing
+    text); these tests pin that the model itself rejects payloads with
+    missing or wrongly-typed fields and accepts well-formed ones."""
+
+    def test_valid_payload(self):
+        payload = {"narratives": [{"ticker": "AAPL", "narrative": "x" * 50}]}
+        parsed = _RationalesResponse.model_validate(payload)
+        assert len(parsed.narratives) == 1
+        assert parsed.narratives[0].ticker == "AAPL"
+
+    def test_empty_narratives_list_valid(self):
+        # The model permits an empty list — Haiku may emit zero narratives if
+        # the contexts list was empty (the caller short-circuits this earlier,
+        # but the contract should still accept it).
+        parsed = _RationalesResponse.model_validate({"narratives": []})
+        assert parsed.narratives == []
+
+    def test_missing_narratives_field_raises(self):
+        from pydantic import ValidationError as PydValidationError
+        with pytest.raises(PydValidationError):
+            _RationalesResponse.model_validate({})
+
+    def test_narrative_missing_ticker_raises(self):
+        from pydantic import ValidationError as PydValidationError
+        with pytest.raises(PydValidationError):
+            _RationalesResponse.model_validate({"narratives": [{"narrative": "no ticker"}]})
+
+    def test_narrative_missing_narrative_raises(self):
+        from pydantic import ValidationError as PydValidationError
+        with pytest.raises(PydValidationError):
+            _RationalesResponse.model_validate({"narratives": [{"ticker": "AAPL"}]})
+
+    def test_narrative_wrong_type_raises(self):
+        from pydantic import ValidationError as PydValidationError
+        with pytest.raises(PydValidationError):
+            _RationalesResponse.model_validate({"narratives": [{"ticker": 123, "narrative": "x"}]})
+
+
+class TestSynthesizeRationalesToolUse:
+    """End-to-end coverage of the Anthropic tool-use path. The Anthropic
+    client is fully mocked so no real API call happens; the assertions
+    pin (a) the tool/tool_choice wiring (b) Pydantic-validated input
+    flows through to the returned dict (c) malformed / missing tool_use
+    blocks fall back to the template path."""
+
+    def _make_mock_anthropic(self, tool_use_input: dict | None, *, stop_reason: str = "tool_use", include_text_block: bool = False):
+        """Build a MagicMock anthropic module + client + response chain.
+        ``tool_use_input=None`` simulates a response with no tool_use block
+        (degenerate-mode probe). Otherwise the mocked tool_use block carries
+        ``input=tool_use_input``."""
+        mock_anthropic = MagicMock()
+        mock_client = MagicMock()
+        mock_anthropic.Anthropic.return_value = mock_client
+
+        blocks = []
+        if include_text_block:
+            text_block = MagicMock()
+            text_block.type = "text"
+            text_block.text = "Sure, here are the rationales:"
+            blocks.append(text_block)
+        if tool_use_input is not None:
+            tool_block = MagicMock()
+            tool_block.type = "tool_use"
+            tool_block.input = tool_use_input
+            blocks.append(tool_block)
+
+        mock_response = MagicMock()
+        mock_response.content = blocks
+        mock_response.stop_reason = stop_reason
+        mock_client.messages.create.return_value = mock_response
+        return mock_anthropic, mock_client
+
+    def test_tool_use_happy_path(self):
+        mock_anthropic, mock_client = self._make_mock_anthropic(
+            {"narratives": [
+                {"ticker": "AAPL", "narrative": "Held — research score 82, GBM UP."},
+                {"ticker": "MSFT", "narrative": "Reduced 5 shares today on profit-take."},
+            ]}
+        )
+        contexts = [{"ticker": "AAPL"}, {"ticker": "MSFT"}]
+        with patch.dict("sys.modules", {"anthropic": mock_anthropic}):
+            result = _synthesize_rationales(contexts)
+        assert result == {
+            "AAPL": "Held — research score 82, GBM UP.",
+            "MSFT": "Reduced 5 shares today on profit-take.",
+        }
+        # Verify the SDK was invoked with the forced tool_choice wiring.
+        call_kwargs = mock_client.messages.create.call_args.kwargs
+        assert call_kwargs["tool_choice"] == {"type": "tool", "name": "emit_rationales"}
+        assert call_kwargs["tools"][0]["name"] == "emit_rationales"
+
+    def test_tool_use_with_preceding_text_block(self):
+        # A text block may precede the tool_use block in response.content — the
+        # synthesizer must pick the tool_use block, not the first content block.
+        mock_anthropic, _ = self._make_mock_anthropic(
+            {"narratives": [{"ticker": "GOOG", "narrative": "y" * 40}]},
+            include_text_block=True,
+        )
+        with patch.dict("sys.modules", {"anthropic": mock_anthropic}):
+            result = _synthesize_rationales([{"ticker": "GOOG"}])
+        assert result == {"GOOG": "y" * 40}
+
+    def test_missing_tool_use_falls_back_to_template(self):
+        # Haiku stopped without emitting the forced tool — template fallback.
+        mock_anthropic, _ = self._make_mock_anthropic(None, stop_reason="end_turn", include_text_block=True)
+        with patch.dict("sys.modules", {"anthropic": mock_anthropic}):
+            result = _synthesize_rationales([{"ticker": "AAPL", "research_score": 82.0, "conviction": "rising"}])
+        # Template fallback populates from the context — research_score should
+        # be in the rendered text.
+        assert "AAPL" in result
+        assert "82" in result["AAPL"]
+        assert "rising" in result["AAPL"]
+
+    def test_malformed_tool_input_falls_back_to_template(self):
+        # tool_use block present but input doesn't match the Pydantic schema.
+        mock_anthropic, _ = self._make_mock_anthropic({"narratives": [{"ticker": "AAPL"}]})  # missing 'narrative'
+        with patch.dict("sys.modules", {"anthropic": mock_anthropic}):
+            result = _synthesize_rationales([{"ticker": "AAPL", "research_score": 90.0}])
+        # Template fallback fires; AAPL is still rendered from the context.
+        assert "AAPL" in result
+        assert "90" in result["AAPL"]
+
+    def test_empty_contexts_short_circuits(self):
+        # Empty input must return before the SDK is touched: neither the
+        # client constructor nor messages.create may be called.
+        mock_anthropic, mock_client = self._make_mock_anthropic({"narratives": []})
+        with patch.dict("sys.modules", {"anthropic": mock_anthropic}):
+            assert _synthesize_rationales([]) == {}
+        mock_anthropic.Anthropic.assert_not_called()
+        mock_client.messages.create.assert_not_called()
+
+
 class TestLoadConstituentsSectorMap:
     """Sector enrichment fallback reads latest weekly constituents.json."""
 