Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/runtime-smoke.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Runtime Smoke Matrix

This file tracks manual smoke checks in real agent runtimes. Record only real runtime results; do not mark a row as pass based on README guidance or CI alone.
This file tracks manual smoke checks in real agent runtimes. Record only real runtime results; do not mark a row as pass based on README guidance or CI alone. Do not store transcripts, screenshots, raw logs, or long runtime output.

## Matrix

Expand Down
141 changes: 140 additions & 1 deletion evals/check_semantic_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
# Eval fixtures and helper scripts; every path lives under ROOT / "evals".
PROMPTS_CSV = ROOT / "evals" / "prompts.csv"
SELF_CHECKS = ROOT / "evals" / "self_checks.yaml"
CHECK_WATCHLIST = ROOT / "evals" / "check_watchlist.py"
# JSON corpus of trigger / no_trigger prompts loaded by validate_trigger_cases().
TRIGGER_CASES = ROOT / "evals" / "trigger_cases.json"

AUTONOMOUS_REMINDER_FORBIDDEN = {
"I'll remind you",
Expand Down Expand Up @@ -62,6 +63,48 @@
"storage-policy",
"agent-workflow-safety",
}
# Decision each trigger reason must carry in trigger_cases.json. This table is
# the single source of truth for which reasons exist and what polarity they
# have; validate_trigger_case_list() rejects a case whose "expected" disagrees
# with the entry for its "reason".
TRIGGER_REASON_EXPECTED = {
    "ambiguous_watchlist_target": "trigger",
    "explicit_watchlist_add": "trigger",
    "generic_delete_without_watchlist": "no_trigger",
    "generic_lifecycle_without_watchlist": "no_trigger",
    "generic_now_check_without_watchlist": "no_trigger",
    "generic_reminder_without_watchlist": "no_trigger",
    "local_private_watchlist_record": "trigger",
    "non_watchlist_wl_text": "no_trigger",
    "preauthorized_watchlist_workflow": "trigger",
    "scheduler_without_watchlist": "no_trigger",
    "secret_storage_without_watchlist": "no_trigger",
    "watchlist_list_review": "trigger",
    "watchlist_scoped_pending_result": "trigger",
    "wl_item_lifecycle_update": "trigger",
}
# Derived from the polarity table rather than listed by hand, so a reason can
# never be "supported" without a polarity (which would raise KeyError during
# validation) or have a polarity without being accepted.
SUPPORTED_TRIGGER_REASONS = set(TRIGGER_REASON_EXPECTED)
# Reasons the corpus must cover at least once.
REQUIRED_TRIGGER_REASONS = {
    "explicit_watchlist_add",
    "wl_item_lifecycle_update",
    "watchlist_list_review",
    "generic_reminder_without_watchlist",
    "generic_now_check_without_watchlist",
    "generic_lifecycle_without_watchlist",
    "non_watchlist_wl_text",
}
# Exact key set of a trigger case; extra and missing keys are both reported.
TRIGGER_CASE_KEYS = {"id", "locale", "prompt", "expected", "reason"}


def fail(message: str) -> int:
Expand Down Expand Up @@ -652,6 +695,97 @@ def validate_case(
validate_storage_contract(case_id, case, expected, errors)


def validate_trigger_case_list(cases: object, errors: list[str]) -> int:
    """Validate a decoded trigger eval corpus, appending problems to *errors*.

    Checks the corpus size (20 to 30 entries), each case's shape (exact key
    set, unique whitespace-clean string id, locale, prompt length, decision,
    reason), that every reason carries the decision listed for it in
    ``TRIGGER_REASON_EXPECTED``, that each decision occurs at least 8 times,
    and that every reason in ``REQUIRED_TRIGGER_REASONS`` is represented.

    Args:
        cases: Decoded JSON root value; anything other than a list is rejected.
        errors: Mutable list that receives one message per problem found.

    Returns:
        The number of entries in *cases*, or 0 when the root is not a list.
    """
    if not isinstance(cases, list):
        errors.append("trigger_cases.json: root value must be a list")
        return 0
    if not 20 <= len(cases) <= 30:
        errors.append("trigger_cases.json: expected 20 to 30 lightweight cases")

    seen_ids: set[str] = set()
    decisions = {"trigger": 0, "no_trigger": 0}
    reasons: set[str] = set()
    for index, case in enumerate(cases):
        # Until a valid id is known, refer to the case by its position.
        case_id = f"trigger_cases[{index}]"
        if not isinstance(case, dict):
            errors.append(f"{case_id}: case must be an object")
            continue

        extra_keys = sorted(set(case) - TRIGGER_CASE_KEYS)
        if extra_keys:
            errors.append(f"{case_id}: unsupported key(s): {', '.join(extra_keys)}")

        # NOTE(review): require_keys is defined elsewhere in this file;
        # presumably it records the missing keys in errors - confirm.
        require_keys(case, TRIGGER_CASE_KEYS, case_id, errors, "trigger case")
        if not TRIGGER_CASE_KEYS.issubset(case):
            continue

        case_id_val = case["id"]
        if not isinstance(case_id_val, str) or not case_id_val.strip():
            errors.append(f"{case_id}: id must be a non-empty string")
            continue
        if case_id_val != case_id_val.strip():
            errors.append(f"{case_id}: id must not have leading or trailing whitespace")
            continue

        # From here on messages use the case's own id instead of its index.
        case_id = case_id_val
        if case_id in seen_ids:
            errors.append(f"{case_id}: duplicate trigger case id")
        seen_ids.add(case_id)

        # The isinstance guards below matter: JSON arrays and objects decode
        # to unhashable list/dict values, and a bare `in` test against a set
        # or dict would raise TypeError instead of recording an error.
        locale = case["locale"]
        if not isinstance(locale, str) or locale not in {"ko", "en", "mixed"}:
            errors.append(f"{case_id}: locale must be ko, en, or mixed")

        prompt = case["prompt"]
        if not isinstance(prompt, str) or not prompt.strip():
            errors.append(f"{case_id}: prompt must be a non-empty string")
        elif len(prompt) > 180:
            errors.append(f"{case_id}: prompt is too long for lightweight trigger eval")

        expected = case["expected"]
        if isinstance(expected, str) and expected in decisions:
            decisions[expected] += 1
        else:
            errors.append(f"{case_id}: expected must be trigger or no_trigger")

        reason = case["reason"]
        if not isinstance(reason, str) or reason not in SUPPORTED_TRIGGER_REASONS:
            errors.append(f"{case_id}: unsupported trigger reason: {reason}")
            continue

        reasons.add(reason)
        expected_for_reason = TRIGGER_REASON_EXPECTED[reason]
        if expected != expected_for_reason:
            errors.append(
                f"{case_id}: reason {reason} must use expected={expected_for_reason}"
            )

    # Corpus-level balance: both decisions need a minimum number of cases.
    for decision, count in decisions.items():
        if count < 8:
            errors.append(f"trigger_cases.json: expected at least 8 {decision} cases")

    missing_reasons = sorted(REQUIRED_TRIGGER_REASONS - reasons)
    if missing_reasons:
        errors.append(
            "trigger_cases.json: missing required trigger reason(s): "
            + ", ".join(missing_reasons)
        )

    return len(cases)


def validate_trigger_cases(errors: list[str]) -> int:
    """Load the trigger eval corpus at ``TRIGGER_CASES`` and validate it.

    Every failure mode (missing file, unreadable or non-UTF-8 file, invalid
    JSON, invalid content) is reported by appending to *errors* rather than
    by raising, so the caller can print all problems together.

    Args:
        errors: Mutable list that receives one message per problem found.

    Returns:
        The value returned by ``validate_trigger_case_list`` for the decoded
        corpus, or 0 when the file is missing, unreadable, or not valid JSON.
    """
    if not TRIGGER_CASES.is_file():
        errors.append(f"Missing trigger eval corpus: {TRIGGER_CASES.relative_to(ROOT)}")
        return 0

    # Read and parse in separate steps so an I/O or decoding failure is not
    # mistaken for a JSON syntax problem (and does not escape as a traceback).
    try:
        raw_text = TRIGGER_CASES.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        errors.append(f"trigger_cases.json: unreadable: {exc}")
        return 0

    try:
        cases = json.loads(raw_text)
    except json.JSONDecodeError as exc:
        errors.append(f"trigger_cases.json: invalid JSON: {exc}")
        return 0

    return validate_trigger_case_list(cases, errors)


def main() -> int:
errors: list[str] = []
if not CASES_DIR.is_dir():
Expand Down Expand Up @@ -689,10 +823,15 @@ def main() -> int:
f"Missing semantic case(s) for self_checks.yaml: {', '.join(missing_self_check_cases)}"
)

trigger_case_count = validate_trigger_cases(errors)

if errors:
return fail("Semantic case check failed:\n" + "\n".join(f"- {error}" for error in errors))

print(f"Semantic case check passed: {len(case_paths)} case(s)")
print(
f"Semantic case check passed: {len(case_paths)} case(s); "
f"{trigger_case_count} trigger case(s)"
)
return 0


Expand Down
149 changes: 149 additions & 0 deletions evals/test_check_watchlist.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# Scripts and data files under REPO_ROOT that these tests reference.
SEMANTIC_SCRIPT = REPO_ROOT / "evals" / "check_semantic_cases.py"
PACKAGE_SCRIPT = REPO_ROOT / "evals" / "check_skill_package.py"
REPO_VALIDATOR = REPO_ROOT / "tools" / "validate_watchlist.py"
# Committed trigger / no_trigger prompt corpus, read directly by the corpus test.
TRIGGER_CASES = REPO_ROOT / "evals" / "trigger_cases.json"

_SEMANTIC_SPEC = importlib.util.spec_from_file_location(
"check_semantic_cases", SEMANTIC_SCRIPT
Expand Down Expand Up @@ -941,7 +942,53 @@ def test_runtime_smoke_doc_tracks_pending_vendor_matrix(self):
self.assertIn(runtime, text)
self.assertIn("pending", text)
self.assertIn("Record only real runtime results", text)
self.assertIn("Do not store transcripts, screenshots, raw logs, or long runtime output.", text)
self.assertIn("without a bundled Python validator", text)
self.assertLessEqual(len(text.splitlines()), 35)

def test_trigger_eval_corpus_is_small_balanced_and_deterministic(self):
    """Check the committed trigger corpus for size, balance, coverage, and shape."""
    corpus = json.loads(TRIGGER_CASES.read_text(encoding="utf-8"))

    # The corpus is meant to stay lightweight: 20 to 30 prompts.
    total = len(corpus)
    self.assertGreaterEqual(total, 20)
    self.assertLessEqual(total, 30)

    # Both decisions must occur, each at least 8 times.
    expected_values = [entry["expected"] for entry in corpus]
    self.assertEqual(set(expected_values), {"trigger", "no_trigger"})
    self.assertGreaterEqual(expected_values.count("trigger"), 8)
    self.assertGreaterEqual(expected_values.count("no_trigger"), 8)

    covered_reasons = {entry["reason"] for entry in corpus}
    required_reasons = (
        "explicit_watchlist_add",
        "wl_item_lifecycle_update",
        "watchlist_list_review",
        "generic_reminder_without_watchlist",
        "generic_now_check_without_watchlist",
        "generic_lifecycle_without_watchlist",
        "non_watchlist_wl_text",
    )
    for required in required_reasons:
        self.assertIn(required, covered_reasons)

    allowed_keys = {"id", "locale", "prompt", "expected", "reason"}
    # Fields that would indicate recorded runtime output in the corpus.
    banned_keys = {
        "actual",
        "response",
        "runtime",
        "runtime_output",
        "transcript",
        "screenshot",
        "logs",
    }
    for entry in corpus:
        with self.subTest(case=entry["id"]):
            self.assertEqual(set(entry), allowed_keys)
            self.assertFalse(banned_keys.intersection(entry))
            self.assertLessEqual(len(entry["prompt"]), 180)

def test_starter_templates_label_commented_item_as_example_only(self):
paths = [
Expand Down Expand Up @@ -1063,6 +1110,107 @@ def test_semantic_case_validation_rejects_unknown_category(self):

self.assertIn("sample-case: category is unsupported: workflow-safety", errors)

def test_trigger_case_validation_rejects_reason_polarity_drift(self):
    """A reason paired with the opposite decision must be reported by case id."""

    def make_case(case_id, prompt, expected, reason):
        # Every fixture case is English; only the other four fields vary.
        return {
            "id": case_id,
            "locale": "en",
            "prompt": prompt,
            "expected": expected,
            "reason": reason,
        }

    corpus = []
    # Nine valid cases per decision keep the balance checks satisfied.
    for n in range(9):
        corpus.append(
            make_case(
                f"trigger-sample-{n}",
                f"Add this to WATCHLIST.md for sample {n}.",
                "trigger",
                "explicit_watchlist_add",
            )
        )
    for n in range(9):
        corpus.append(
            make_case(
                f"no-trigger-sample-{n}",
                f"Remind me about sample {n} tomorrow.",
                "no_trigger",
                "generic_reminder_without_watchlist",
            )
        )
    # Two cases whose decision contradicts their reason.
    corpus.append(
        make_case(
            "wrong-polarity-trigger",
            "Remind me tomorrow at 9.",
            "trigger",
            "generic_reminder_without_watchlist",
        )
    )
    corpus.append(
        make_case(
            "wrong-polarity-no-trigger",
            "Add this to WATCHLIST.md.",
            "no_trigger",
            "explicit_watchlist_add",
        )
    )

    collected = []
    SEMANTIC_CASES.validate_trigger_case_list(corpus, collected)

    for message in (
        "wrong-polarity-trigger: reason generic_reminder_without_watchlist must use expected=no_trigger",
        "wrong-polarity-no-trigger: reason explicit_watchlist_add must use expected=trigger",
    ):
        self.assertIn(message, collected)

def test_trigger_case_validation_rejects_invalid_id(self):
    """Non-string and whitespace-padded ids must be reported by list position."""

    def make_case(case_id, prompt, expected, reason):
        # Every fixture case is English; only the other four fields vary.
        return {
            "id": case_id,
            "locale": "en",
            "prompt": prompt,
            "expected": expected,
            "reason": reason,
        }

    corpus = []
    # Indexes 0-17: nine valid cases per decision.
    for n in range(9):
        corpus.append(
            make_case(
                f"trigger-sample-{n}",
                f"Add this to WATCHLIST.md for sample {n}.",
                "trigger",
                "explicit_watchlist_add",
            )
        )
    for n in range(9):
        corpus.append(
            make_case(
                f"no-trigger-sample-{n}",
                f"Remind me about sample {n} tomorrow.",
                "no_trigger",
                "generic_reminder_without_watchlist",
            )
        )
    # Index 18: id is not a string. Index 19: id has surrounding whitespace.
    corpus.append(
        make_case(
            123,
            "Add this to WATCHLIST.md.",
            "trigger",
            "explicit_watchlist_add",
        )
    )
    corpus.append(
        make_case(
            " whitespace-id ",
            "Remind me tomorrow at 9.",
            "no_trigger",
            "generic_reminder_without_watchlist",
        )
    )

    collected = []
    SEMANTIC_CASES.validate_trigger_case_list(corpus, collected)

    self.assertIn("trigger_cases[18]: id must be a non-empty string", collected)
    self.assertIn(
        "trigger_cases[19]: id must not have leading or trailing whitespace",
        collected,
    )

def test_false_trigger_semantic_case_validates_optional_must_not_list(self):
case = {
"id": "sample-case",
Expand Down Expand Up @@ -1115,6 +1263,7 @@ def test_semantic_case_checker_passes(self):

self.assertEqual(result.returncode, 0, result.stderr + result.stdout)
self.assertIn("Semantic case check passed", result.stdout)
self.assertIn("trigger case(s)", result.stdout)

def test_semantic_self_check_parser_supports_single_quoted_prompts(self):
text = """cases:
Expand Down
Loading
Loading