From b4fcb7975fecf8145758ed7aef76b113ab35e0fe Mon Sep 17 00:00:00 2001 From: Injae <15044917+dd3ok@users.noreply.github.com> Date: Mon, 15 Jun 2026 09:31:31 +0900 Subject: [PATCH 1/2] Add lightweight watchlist trigger evals --- docs/runtime-smoke.md | 2 +- evals/check_semantic_cases.py | 133 ++++++++++++++++++++++++++++++- evals/test_check_watchlist.py | 100 ++++++++++++++++++++++++ evals/trigger_cases.json | 142 ++++++++++++++++++++++++++++++++++ 4 files changed, 375 insertions(+), 2 deletions(-) create mode 100644 evals/trigger_cases.json diff --git a/docs/runtime-smoke.md b/docs/runtime-smoke.md index c99219d..9dccb46 100644 --- a/docs/runtime-smoke.md +++ b/docs/runtime-smoke.md @@ -1,6 +1,6 @@ # Runtime Smoke Matrix -This file tracks manual smoke checks in real agent runtimes. Record only real runtime results; do not mark a row as pass based on README guidance or CI alone. +This file tracks manual smoke checks in real agent runtimes. Record only real runtime results; do not mark a row as pass based on README guidance or CI alone. Do not store transcripts, screenshots, raw logs, or long runtime output. ## Matrix diff --git a/evals/check_semantic_cases.py b/evals/check_semantic_cases.py index 8073b46..056fef9 100644 --- a/evals/check_semantic_cases.py +++ b/evals/check_semantic_cases.py @@ -17,6 +17,7 @@ PROMPTS_CSV = ROOT / "evals" / "prompts.csv" SELF_CHECKS = ROOT / "evals" / "self_checks.yaml" CHECK_WATCHLIST = ROOT / "evals" / "check_watchlist.py" +TRIGGER_CASES = ROOT / "evals" / "trigger_cases.json" AUTONOMOUS_REMINDER_FORBIDDEN = { "I'll remind you", @@ -62,6 +63,48 @@ "storage-policy", "agent-workflow-safety", } +SUPPORTED_TRIGGER_REASONS = { + "ambiguous_watchlist_target", + "explicit_watchlist_add", + "generic_delete_without_watchlist", + "generic_lifecycle_without_watchlist", + "generic_now_check_without_watchlist", + "generic_reminder_without_watchlist", + "local_private_watchlist_record", + "non_watchlist_wl_text", + "preauthorized_watchlist_workflow", + "scheduler_without_watchlist", + "secret_storage_without_watchlist", + "watchlist_list_review", + "watchlist_scoped_pending_result", + "wl_item_lifecycle_update", +} +REQUIRED_TRIGGER_REASONS = { + "explicit_watchlist_add", + "wl_item_lifecycle_update", + "watchlist_list_review", + "generic_reminder_without_watchlist", + "generic_now_check_without_watchlist", + "generic_lifecycle_without_watchlist", + "non_watchlist_wl_text", +} +TRIGGER_CASE_KEYS = {"id", "locale", "prompt", "expected", "reason"} +TRIGGER_REASON_EXPECTED = { + "ambiguous_watchlist_target": "trigger", + "explicit_watchlist_add": "trigger", + "generic_delete_without_watchlist": "no_trigger", + "generic_lifecycle_without_watchlist": "no_trigger", + "generic_now_check_without_watchlist": "no_trigger", + "generic_reminder_without_watchlist": "no_trigger", + "local_private_watchlist_record": "trigger", + "non_watchlist_wl_text": "no_trigger", + "preauthorized_watchlist_workflow": "trigger", + "scheduler_without_watchlist": "no_trigger", + "secret_storage_without_watchlist": "no_trigger", + "watchlist_list_review": "trigger", + "watchlist_scoped_pending_result": "trigger", + "wl_item_lifecycle_update": "trigger", +} def fail(message: str) -> int: @@ -652,6 +695,89 @@ def validate_case( validate_storage_contract(case_id, case, expected, errors) +def validate_trigger_case_list(cases: object, errors: list[str]) -> int: + if not isinstance(cases, list): + errors.append("trigger_cases.json: root value must be a list") + return 0 + if not 20 <= len(cases) <= 30: + errors.append("trigger_cases.json: expected 20 to 30 lightweight cases") + + seen_ids: set[str] = set() + decisions = {"trigger": 0, "no_trigger": 0} + reasons: set[str] = set() + for index, case in enumerate(cases): + case_id = f"trigger_cases[{index}]" + if not isinstance(case, dict): + errors.append(f"{case_id}: case must be an object") + continue + + extra_keys = sorted(set(case) - TRIGGER_CASE_KEYS) + if extra_keys: + errors.append(f"{case_id}: unsupported key(s): {', '.join(extra_keys)}") + + require_keys(case, TRIGGER_CASE_KEYS, case_id, errors, "trigger case") + if not TRIGGER_CASE_KEYS.issubset(case): + continue + + case_id = str(case["id"]) + if case_id in seen_ids: + errors.append(f"{case_id}: duplicate trigger case id") + seen_ids.add(case_id) + + if case.get("locale") not in {"ko", "en", "mixed"}: + errors.append(f"{case_id}: locale must be ko, en, or mixed") + + prompt = case.get("prompt") + if not isinstance(prompt, str) or not prompt.strip(): + errors.append(f"{case_id}: prompt must be a non-empty string") + elif len(prompt) > 180: + errors.append(f"{case_id}: prompt is too long for lightweight trigger eval") + + expected = case.get("expected") + if expected not in decisions: + errors.append(f"{case_id}: expected must be trigger or no_trigger") + else: + decisions[str(expected)] += 1 + + reason = case.get("reason") + if reason not in SUPPORTED_TRIGGER_REASONS: + errors.append(f"{case_id}: unsupported trigger reason: {reason}") + else: + reasons.add(str(reason)) + expected_for_reason = TRIGGER_REASON_EXPECTED[str(reason)] + if expected != expected_for_reason: + errors.append( + f"{case_id}: reason {reason} must use expected={expected_for_reason}" + ) + + for decision, count in decisions.items(): + if count < 8: + errors.append(f"trigger_cases.json: expected at least 8 {decision} cases") + + missing_reasons = sorted(REQUIRED_TRIGGER_REASONS - reasons) + if missing_reasons: + errors.append( + "trigger_cases.json: missing required trigger reason(s): " + + ", ".join(missing_reasons) + ) + + return len(cases) + + +def validate_trigger_cases(errors: list[str]) -> int: + if not TRIGGER_CASES.is_file(): + errors.append(f"Missing trigger eval corpus: {TRIGGER_CASES.relative_to(ROOT)}") + return 0 + + try: + cases = json.loads(TRIGGER_CASES.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + errors.append(f"trigger_cases.json: invalid JSON: {exc}") + return 0 + + return validate_trigger_case_list(cases, errors) + + def main() -> int: errors: list[str] = [] if not CASES_DIR.is_dir(): @@ -689,10 +815,15 @@ def main() -> int: f"Missing semantic case(s) for self_checks.yaml: {', '.join(missing_self_check_cases)}" ) + trigger_case_count = validate_trigger_cases(errors) + if errors: return fail("Semantic case check failed:\n" + "\n".join(f"- {error}" for error in errors)) - print(f"Semantic case check passed: {len(case_paths)} case(s)") + print( + f"Semantic case check passed: {len(case_paths)} case(s); " + f"{trigger_case_count} trigger case(s)" + ) return 0 diff --git a/evals/test_check_watchlist.py b/evals/test_check_watchlist.py index 17f539a..63e97d5 100644 --- a/evals/test_check_watchlist.py +++ b/evals/test_check_watchlist.py @@ -21,6 +21,7 @@ SEMANTIC_SCRIPT = REPO_ROOT / "evals" / "check_semantic_cases.py" PACKAGE_SCRIPT = REPO_ROOT / "evals" / "check_skill_package.py" REPO_VALIDATOR = REPO_ROOT / "tools" / "validate_watchlist.py" +TRIGGER_CASES = REPO_ROOT / "evals" / "trigger_cases.json" _SEMANTIC_SPEC = importlib.util.spec_from_file_location( "check_semantic_cases", SEMANTIC_SCRIPT @@ -941,7 +942,53 @@ def test_runtime_smoke_doc_tracks_pending_vendor_matrix(self): self.assertIn(runtime, text) self.assertIn("pending", text) self.assertIn("Record only real runtime results", text) + self.assertIn("Do not store transcripts, screenshots, raw logs, or long runtime output.", text) self.assertIn("without a bundled Python validator", text) + self.assertLessEqual(len(text.splitlines()), 35) + + def test_trigger_eval_corpus_is_small_balanced_and_deterministic(self): + cases = json.loads(TRIGGER_CASES.read_text(encoding="utf-8")) + + self.assertGreaterEqual(len(cases), 20) + self.assertLessEqual(len(cases), 30) + + decisions = {case["expected"] for case in cases} + self.assertEqual(decisions, {"trigger", "no_trigger"}) + self.assertGreaterEqual( + sum(1 for case in cases if case["expected"] == "trigger"), + 8, + ) + self.assertGreaterEqual( + sum(1 for case in cases if case["expected"] == "no_trigger"), + 8, + ) + + reasons = {case["reason"] for case in cases} + for reason in [ + "explicit_watchlist_add", + "wl_item_lifecycle_update", + "watchlist_list_review", + "generic_reminder_without_watchlist", + "generic_now_check_without_watchlist", + "generic_lifecycle_without_watchlist", + "non_watchlist_wl_text", + ]: + self.assertIn(reason, reasons) + + forbidden_fields = { + "actual", + "response", + "runtime", + "runtime_output", + "transcript", + "screenshot", + "logs", + } + for case in cases: + with self.subTest(case=case["id"]): + self.assertEqual(set(case), {"id", "locale", "prompt", "expected", "reason"}) + self.assertFalse(forbidden_fields.intersection(case)) + self.assertLessEqual(len(case["prompt"]), 180) def test_starter_templates_label_commented_item_as_example_only(self): paths = [ @@ -1063,6 +1110,58 @@ def test_semantic_case_validation_rejects_unknown_category(self): self.assertIn("sample-case: category is unsupported: workflow-safety", errors) + def test_trigger_case_validation_rejects_reason_polarity_drift(self): + cases = [ + { + "id": f"trigger-sample-{index}", + "locale": "en", + "prompt": f"Add this to WATCHLIST.md for sample {index}.", + "expected": "trigger", + "reason": "explicit_watchlist_add", + } + for index in range(9) + ] + cases.extend( + { + "id": f"no-trigger-sample-{index}", + "locale": "en", + "prompt": f"Remind me about sample {index} tomorrow.", + "expected": "no_trigger", + "reason": "generic_reminder_without_watchlist", + } + for index in range(9) + ) + cases.extend( + [ + { + "id": "wrong-polarity-trigger", + "locale": "en", + "prompt": "Remind me tomorrow at 9.", + "expected": "trigger", + "reason": "generic_reminder_without_watchlist", + }, + { + "id": "wrong-polarity-no-trigger", + "locale": "en", + "prompt": "Add this to WATCHLIST.md.", + "expected": "no_trigger", + "reason": "explicit_watchlist_add", + }, + ] + ) + errors = [] + + SEMANTIC_CASES.validate_trigger_case_list(cases, errors) + + self.assertIn( + "wrong-polarity-trigger: reason generic_reminder_without_watchlist must use expected=no_trigger", + errors, + ) + self.assertIn( + "wrong-polarity-no-trigger: reason explicit_watchlist_add must use expected=trigger", + errors, + ) + def test_false_trigger_semantic_case_validates_optional_must_not_list(self): case = { "id": "sample-case", @@ -1115,6 +1214,7 @@ def test_semantic_case_checker_passes(self): self.assertEqual(result.returncode, 0, result.stderr + result.stdout) self.assertIn("Semantic case check passed", result.stdout) + self.assertIn("trigger case(s)", result.stdout) def test_semantic_self_check_parser_supports_single_quoted_prompts(self): text = """cases: diff --git a/evals/trigger_cases.json b/evals/trigger_cases.json new file mode 100644 index 0000000..51415ea --- /dev/null +++ b/evals/trigger_cases.json @@ -0,0 +1,142 @@ +[ + { + "id": "trigger-explicit-watchlist-add-en", + "locale": "en", + "prompt": "Add this to WATCHLIST.md. Check PR #25 CI at 17:00.", + "expected": "trigger", + "reason": "explicit_watchlist_add" + }, + { + "id": "trigger-explicit-watchlist-add-kr", + "locale": "ko", + "prompt": "WATCHLIST.md에 추가해줘. 오늘 17시에 배포 확인.", + "expected": "trigger", + "reason": "explicit_watchlist_add" + }, + { + "id": "trigger-wl-complete-en", + "locale": "en", + "prompt": "Mark WL-20260614-001 done; CI passed.", + "expected": "trigger", + "reason": "wl_item_lifecycle_update" + }, + { + "id": "trigger-wl-drop-kr", + "locale": "ko", + "prompt": "WL-20260614-002 드롭 처리해. 더 이상 필요 없어.", + "expected": "trigger", + "reason": "wl_item_lifecycle_update" + }, + { + "id": "trigger-watchlist-review-en", + "locale": "en", + "prompt": "Review WATCHLIST.md items due today.", + "expected": "trigger", + "reason": "watchlist_list_review" + }, + { + "id": "trigger-watchlist-review-kr", + "locale": "ko", + "prompt": "WATCHLIST.md 목록만 보여줘.", + "expected": "trigger", + "reason": "watchlist_list_review" + }, + { + "id": "trigger-local-private-watchlist-en", + "locale": "en", + "prompt": "Add this to watchlist as a local note. Check test logs at 18:00.", + "expected": "trigger", + "reason": "local_private_watchlist_record" + }, + { + "id": "trigger-ambiguous-target-en", + "locale": "en", + "prompt": "Add this to watchlist. Check deployment at 17:00.", + "expected": "trigger", + "reason": "ambiguous_watchlist_target" + }, + { + "id": "trigger-preauthorized-workflow-en", + "locale": "en", + "prompt": "For this repo's watchlist workflow, record the nightly sync check tomorrow.", + "expected": "trigger", + "reason": "preauthorized_watchlist_workflow" + }, + { + "id": "trigger-watchlist-scoped-pending-en", + "locale": "en", + "prompt": "The WATCHLIST follow-up for PR review is pending; record checking it after CI completes.", + "expected": "trigger", + "reason": "watchlist_scoped_pending_result" + }, + { + "id": "no-trigger-generic-reminder-en", + "locale": "en", + "prompt": "Remind me tomorrow at 9 to check deployment.", + "expected": "no_trigger", + "reason": "generic_reminder_without_watchlist" + }, + { + "id": "no-trigger-generic-reminder-kr", + "locale": "ko", + "prompt": "내일 9시에 배포 확인하라고 리마인드해줘.", + "expected": "no_trigger", + "reason": "generic_reminder_without_watchlist" + }, + { + "id": "no-trigger-now-check-en", + "locale": "en", + "prompt": "Check the GitHub Actions result now.", + "expected": "no_trigger", + "reason": "generic_now_check_without_watchlist" + }, + { + "id": "no-trigger-now-check-kr", + "locale": "ko", + "prompt": "지금 배포 결과 확인해줘.", + "expected": "no_trigger", + "reason": "generic_now_check_without_watchlist" + }, + { + "id": "no-trigger-generic-delete-en", + "locale": "en", + "prompt": "Delete README.md.", + "expected": "no_trigger", + "reason": "generic_delete_without_watchlist" + }, + { + "id": "no-trigger-generic-complete-en", + "locale": "en", + "prompt": "The refactor is complete.", + "expected": "no_trigger", + "reason": "generic_lifecycle_without_watchlist" + }, + { + "id": "no-trigger-generic-cancel-kr", + "locale": "ko", + "prompt": "방금 말한 작업 취소해줘.", + "expected": "no_trigger", + "reason": "generic_lifecycle_without_watchlist" + }, + { + "id": "no-trigger-non-watchlist-wl-en", + "locale": "en", + "prompt": "The WL-ABC warehouse label is obsolete; delete it.", + "expected": "no_trigger", + "reason": "non_watchlist_wl_text" + }, + { + "id": "no-trigger-scheduler-without-watchlist-en", + "locale": "en", + "prompt": "Schedule a calendar reminder to review CI tomorrow.", + "expected": "no_trigger", + "reason": "scheduler_without_watchlist" + }, + { + "id": "no-trigger-secret-storage-without-watchlist-en", + "locale": "en", + "prompt": "Store my token abc123 and use it later.", + "expected": "no_trigger", + "reason": "secret_storage_without_watchlist" + } +] From 41760de7465f52e0be91b771396605221e89cf2e Mon Sep 17 00:00:00 2001 From: Injae <15044917+dd3ok@users.noreply.github.com> Date: Mon, 15 Jun 2026 19:34:48 +0900 Subject: [PATCH 2/2] Validate trigger case ids --- evals/check_semantic_cases.py | 10 ++++++- evals/test_check_watchlist.py | 49 +++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/evals/check_semantic_cases.py b/evals/check_semantic_cases.py index 056fef9..1836694 100644 --- a/evals/check_semantic_cases.py +++ b/evals/check_semantic_cases.py @@ -719,7 +719,15 @@ def validate_trigger_case_list(cases: object, errors: list[str]) -> int: if not TRIGGER_CASE_KEYS.issubset(case): continue - case_id = str(case["id"]) + case_id_val = case.get("id") + if not isinstance(case_id_val, str) or not case_id_val.strip(): + errors.append(f"{case_id}: id must be a non-empty string") + continue + if case_id_val != case_id_val.strip(): + errors.append(f"{case_id}: id must not have leading or trailing whitespace") + continue + + case_id = case_id_val if case_id in seen_ids: errors.append(f"{case_id}: duplicate trigger case id") seen_ids.add(case_id) diff --git a/evals/test_check_watchlist.py b/evals/test_check_watchlist.py index 63e97d5..91ab810 100644 --- a/evals/test_check_watchlist.py +++ b/evals/test_check_watchlist.py @@ -1162,6 +1162,55 @@ def test_trigger_case_validation_rejects_reason_polarity_drift(self): errors, ) + def test_trigger_case_validation_rejects_invalid_id(self): + cases = [ + { + "id": f"trigger-sample-{index}", + "locale": "en", + "prompt": f"Add this to WATCHLIST.md for sample {index}.", + "expected": "trigger", + "reason": "explicit_watchlist_add", + } + for index in range(9) + ] + cases.extend( + { + "id": f"no-trigger-sample-{index}", + "locale": "en", + "prompt": f"Remind me about sample {index} tomorrow.", + "expected": "no_trigger", + "reason": "generic_reminder_without_watchlist", + } + for index in range(9) + ) + cases.extend( + [ + { + "id": 123, + "locale": "en", + "prompt": "Add this to WATCHLIST.md.", + "expected": "trigger", + "reason": "explicit_watchlist_add", + }, + { + "id": " whitespace-id ", + "locale": "en", + "prompt": "Remind me tomorrow at 9.", + "expected": "no_trigger", + "reason": "generic_reminder_without_watchlist", + }, + ] + ) + errors = [] + + SEMANTIC_CASES.validate_trigger_case_list(cases, errors) + + self.assertIn("trigger_cases[18]: id must be a non-empty string", errors) + self.assertIn( + "trigger_cases[19]: id must not have leading or trailing whitespace", + errors, + ) + def test_false_trigger_semantic_case_validates_optional_must_not_list(self): case = { "id": "sample-case",