Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 72 additions & 4 deletions executor/eod_reconcile.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import boto3
import pandas as pd
import yaml
from pydantic import BaseModel, Field, ValidationError
sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))

from executor.eod_emailer import send_eod_email
Expand Down Expand Up @@ -257,8 +258,54 @@ def _build_position_contexts(
return contexts, data_warnings


class _Narrative(BaseModel):
"""One per-position rationale."""

ticker: str = Field(..., description="Position ticker symbol (e.g. AAPL).")
narrative: str = Field(
...,
description=(
"2-3 sentences explaining why this position is held today, citing the "
"research thesis, technical signals, and GBM predictions where relevant. "
"If a trade was made today, the narrative also explains why."
),
)


class _RationalesResponse(BaseModel):
"""Tool-use payload for the EOD rationale synthesis call. The Anthropic
SDK validates this shape at the tool-use layer; Pydantic re-validates it
here for type safety + strict-field enforcement. Replaces the legacy
"ask for JSON in the prompt and json.loads the text" pattern that L1248
/ L2669 documented as recurrence-prone (markdown fences, preamble,
trailing text — string-pattern whack-a-mole)."""

narratives: list[_Narrative] = Field(
...,
description="One narrative per position in the input list.",
)


_RATIONALES_TOOL = {
"name": "emit_rationales",
"description": (
"Emit per-position rationales for the EOD report. Call this tool exactly "
"once with the full list — one narrative per input position."
),
"input_schema": _RationalesResponse.model_json_schema(),
}


def _synthesize_rationales(contexts: list[dict]) -> dict[str, str]:
"""Call Haiku to synthesize per-position narratives. Falls back to templates."""
"""Call Haiku via Anthropic tool-use + Pydantic validation to synthesize
per-position narratives. Falls back to templates on any failure.

L1248 / L2669: previous implementation read Haiku's freeform text and
tried to ``json.loads`` it — recurrence-prone (markdown fences /
preamble / trailing text). Tool-use makes the parse failure mode
structurally impossible: Haiku returns a typed ``tool_use`` block
whose ``input`` is schema-validated by the SDK *before* it lands here.
"""
if not contexts:
return {}

Expand All @@ -272,17 +319,38 @@ def _synthesize_rationales(contexts: list[dict]) -> dict[str, str]:
"For each position below, write 2-3 sentences explaining why it is held, "
"focusing on near-term catalysts (research thesis, technical signals, GBM predictions). "
"If a trade was made today, explain why. Be specific about numbers.\n\n"
"Return valid JSON only: {\"narratives\": [{\"ticker\": \"XXX\", \"narrative\": \"...\"}]}\n\n"
"Call the emit_rationales tool exactly once with one narrative per position.\n\n"
f"Positions:\n{json.dumps(contexts, indent=2, default=str)}"
)

response = client.messages.create(
model="claude-haiku-4-5-20251001",
max_tokens=2000,
tools=[_RATIONALES_TOOL],
tool_choice={"type": "tool", "name": "emit_rationales"},
messages=[{"role": "user", "content": prompt}],
)
result = json.loads(response.content[0].text)
return {n["ticker"]: n["narrative"] for n in result.get("narratives", [])}
# tool_choice={"type": "tool", "name": ...} forces Haiku to emit a
# tool_use block — but Anthropic still allows additional text blocks
# alongside it. Pick the tool_use block explicitly.
tool_use = next(
(b for b in response.content if getattr(b, "type", None) == "tool_use"),
None,
)
if tool_use is None:
raise RuntimeError(
"Haiku response missing the forced emit_rationales tool_use block — "
f"stop_reason={response.stop_reason!r}"
)
try:
parsed = _RationalesResponse.model_validate(tool_use.input)
except ValidationError as e:
logger.warning(
f"LLM rationale tool_use failed Pydantic validation: {e} — "
f"input={tool_use.input!r}"
)
raise
return {n.ticker: n.narrative for n in parsed.narratives}
except Exception as e:
logger.warning(f"LLM rationale synthesis failed: {e} — using template fallback")

Expand Down
133 changes: 133 additions & 0 deletions tests/test_eod_reconcile_logic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
_apply_dividend_delta,
_compute_unattributed_residual_pct,
_load_constituents_sector_map,
_Narrative,
_RationalesResponse,
_resolve_prior_price,
_synthesize_rationales,
)
Expand Down Expand Up @@ -193,6 +195,137 @@ def test_multiple_tickers(self):
assert "MSFT" in result


class TestRationalesResponsePydantic:
"""L1248/L2669: Pydantic model that validates the tool-use payload
returned by Haiku. Validation here makes the parse-failure-mode that
bare json.loads used to hit (markdown fences, preamble, trailing
text) structurally impossible — the SDK has already shape-checked
the tool_use.input before we see it; this re-validates field types."""

def test_valid_payload(self):
payload = {"narratives": [{"ticker": "AAPL", "narrative": "x" * 50}]}
parsed = _RationalesResponse.model_validate(payload)
assert len(parsed.narratives) == 1
assert parsed.narratives[0].ticker == "AAPL"

def test_empty_narratives_list_valid(self):
# The model permits an empty list — Haiku may emit zero narratives if
# the contexts list was empty (the caller short-circuits this earlier,
# but the contract should still accept it).
parsed = _RationalesResponse.model_validate({"narratives": []})
assert parsed.narratives == []

def test_missing_narratives_field_raises(self):
from pydantic import ValidationError as PydValidationError
with pytest.raises(PydValidationError):
_RationalesResponse.model_validate({})

def test_narrative_missing_ticker_raises(self):
from pydantic import ValidationError as PydValidationError
with pytest.raises(PydValidationError):
_RationalesResponse.model_validate({"narratives": [{"narrative": "no ticker"}]})

def test_narrative_missing_narrative_raises(self):
from pydantic import ValidationError as PydValidationError
with pytest.raises(PydValidationError):
_RationalesResponse.model_validate({"narratives": [{"ticker": "AAPL"}]})

def test_narrative_wrong_type_raises(self):
from pydantic import ValidationError as PydValidationError
with pytest.raises(PydValidationError):
_RationalesResponse.model_validate({"narratives": [{"ticker": 123, "narrative": "x"}]})


class TestSynthesizeRationalesToolUse:
"""End-to-end coverage of the Anthropic tool-use path. The Anthropic
client is fully mocked so no real API call happens; the assertions
pin (a) the tool/tool_choice wiring (b) Pydantic-validated input
flows through to the returned dict (c) malformed / missing tool_use
blocks fall back to the template path."""

def _make_mock_anthropic(self, tool_use_input: dict | None, *, stop_reason: str = "tool_use", include_text_block: bool = False):
"""Build a MagicMock anthropic module + client + response chain.
``tool_use_input=None`` simulates a response with no tool_use block
(degenerate-mode probe). Otherwise the mocked tool_use block carries
``input=tool_use_input``."""
mock_anthropic = MagicMock()
mock_client = MagicMock()
mock_anthropic.Anthropic.return_value = mock_client

blocks = []
if include_text_block:
text_block = MagicMock()
text_block.type = "text"
text_block.text = "Sure, here are the rationales:"
blocks.append(text_block)
if tool_use_input is not None:
tool_block = MagicMock()
tool_block.type = "tool_use"
tool_block.input = tool_use_input
blocks.append(tool_block)

mock_response = MagicMock()
mock_response.content = blocks
mock_response.stop_reason = stop_reason
mock_client.messages.create.return_value = mock_response
return mock_anthropic, mock_client

def test_tool_use_happy_path(self):
mock_anthropic, mock_client = self._make_mock_anthropic(
{"narratives": [
{"ticker": "AAPL", "narrative": "Held — research score 82, GBM UP."},
{"ticker": "MSFT", "narrative": "Reduced 5 shares today on profit-take."},
]}
)
contexts = [{"ticker": "AAPL"}, {"ticker": "MSFT"}]
with patch.dict("sys.modules", {"anthropic": mock_anthropic}):
result = _synthesize_rationales(contexts)
assert result == {
"AAPL": "Held — research score 82, GBM UP.",
"MSFT": "Reduced 5 shares today on profit-take.",
}
# Verify the SDK was invoked with the forced tool_choice wiring.
call_kwargs = mock_client.messages.create.call_args.kwargs
assert call_kwargs["tool_choice"] == {"type": "tool", "name": "emit_rationales"}
assert call_kwargs["tools"][0]["name"] == "emit_rationales"

def test_tool_use_with_preceding_text_block(self):
# Anthropic permits a text block before the tool_use block — the
# synthesizer must pick the tool_use block, not the first content block.
mock_anthropic, _ = self._make_mock_anthropic(
{"narratives": [{"ticker": "GOOG", "narrative": "y" * 40}]},
include_text_block=True,
)
with patch.dict("sys.modules", {"anthropic": mock_anthropic}):
result = _synthesize_rationales([{"ticker": "GOOG"}])
assert result == {"GOOG": "y" * 40}

def test_missing_tool_use_falls_back_to_template(self):
# Haiku stopped without emitting the forced tool — template fallback.
mock_anthropic, _ = self._make_mock_anthropic(None, stop_reason="end_turn", include_text_block=True)
with patch.dict("sys.modules", {"anthropic": mock_anthropic}):
result = _synthesize_rationales([{"ticker": "AAPL", "research_score": 82.0, "conviction": "rising"}])
# Template fallback populates from the context — research_score should
# be in the rendered text.
assert "AAPL" in result
assert "82" in result["AAPL"]
assert "rising" in result["AAPL"]

def test_malformed_tool_input_falls_back_to_template(self):
# tool_use block present but input doesn't match the Pydantic schema.
mock_anthropic, _ = self._make_mock_anthropic({"narratives": [{"ticker": "AAPL"}]}) # missing 'narrative'
with patch.dict("sys.modules", {"anthropic": mock_anthropic}):
result = _synthesize_rationales([{"ticker": "AAPL", "research_score": 90.0}])
# Template fallback fires; AAPL is still rendered from the context.
assert "AAPL" in result
assert "90" in result["AAPL"]

def test_empty_contexts_short_circuits(self):
# Empty input never calls the SDK — verify by leaving anthropic
# unmocked; a real import would still resolve but not be invoked.
assert _synthesize_rationales([]) == {}


class TestLoadConstituentsSectorMap:
"""Sector enrichment fallback reads latest weekly constituents.json."""

Expand Down
Loading