From b4fcb7975fecf8145758ed7aef76b113ab35e0fe Mon Sep 17 00:00:00 2001
From: Injae <15044917+dd3ok@users.noreply.github.com>
Date: Mon, 15 Jun 2026 09:31:31 +0900
Subject: [PATCH 1/2] Add lightweight watchlist trigger evals

---
 docs/runtime-smoke.md         |   2 +-
 evals/check_semantic_cases.py | 133 ++++++++++++++++++++++++++++++-
 evals/test_check_watchlist.py | 100 ++++++++++++++++++++++++
 evals/trigger_cases.json      | 142 ++++++++++++++++++++++++++++++++++
 4 files changed, 375 insertions(+), 2 deletions(-)
 create mode 100644 evals/trigger_cases.json

diff --git a/docs/runtime-smoke.md b/docs/runtime-smoke.md
index c99219d..9dccb46 100644
--- a/docs/runtime-smoke.md
+++ b/docs/runtime-smoke.md
@@ -1,6 +1,6 @@
 # Runtime Smoke Matrix
 
-This file tracks manual smoke checks in real agent runtimes. Record only real runtime results; do not mark a row as pass based on README guidance or CI alone.
+This file tracks manual smoke checks in real agent runtimes. Record only real runtime results; do not mark a row as pass based on README guidance or CI alone. Do not store transcripts, screenshots, raw logs, or long runtime output.
 
 ## Matrix
 
diff --git a/evals/check_semantic_cases.py b/evals/check_semantic_cases.py
index 8073b46..056fef9 100644
--- a/evals/check_semantic_cases.py
+++ b/evals/check_semantic_cases.py
@@ -17,6 +17,7 @@
 PROMPTS_CSV = ROOT / "evals" / "prompts.csv"
 SELF_CHECKS = ROOT / "evals" / "self_checks.yaml"
 CHECK_WATCHLIST = ROOT / "evals" / "check_watchlist.py"
+TRIGGER_CASES = ROOT / "evals" / "trigger_cases.json"
 
 AUTONOMOUS_REMINDER_FORBIDDEN = {
     "I'll remind you",
@@ -62,6 +63,48 @@
     "storage-policy",
     "agent-workflow-safety",
 }
+SUPPORTED_TRIGGER_REASONS = {  # every reason value a trigger case may use
+    "ambiguous_watchlist_target",
+    "explicit_watchlist_add",
+    "generic_delete_without_watchlist",
+    "generic_lifecycle_without_watchlist",
+    "generic_now_check_without_watchlist",
+    "generic_reminder_without_watchlist",
+    "local_private_watchlist_record",
+    "non_watchlist_wl_text",
+    "preauthorized_watchlist_workflow",
+    "scheduler_without_watchlist",
+    "secret_storage_without_watchlist",
+    "watchlist_list_review",
+    "watchlist_scoped_pending_result",
+    "wl_item_lifecycle_update",
+}
+REQUIRED_TRIGGER_REASONS = {  # reasons the corpus must cover at least once
+    "explicit_watchlist_add",
+    "wl_item_lifecycle_update",
+    "watchlist_list_review",
+    "generic_reminder_without_watchlist",
+    "generic_now_check_without_watchlist",
+    "generic_lifecycle_without_watchlist",
+    "non_watchlist_wl_text",
+}
+TRIGGER_CASE_KEYS = {"id", "locale", "prompt", "expected", "reason"}  # exact key set of one case
+TRIGGER_REASON_EXPECTED = {  # reason -> the only "expected" decision it may be paired with
+    "ambiguous_watchlist_target": "trigger",
+    "explicit_watchlist_add": "trigger",
+    "generic_delete_without_watchlist": "no_trigger",
+    "generic_lifecycle_without_watchlist": "no_trigger",
+    "generic_now_check_without_watchlist": "no_trigger",
+    "generic_reminder_without_watchlist": "no_trigger",
+    "local_private_watchlist_record": "trigger",
+    "non_watchlist_wl_text": "no_trigger",
+    "preauthorized_watchlist_workflow": "trigger",
+    "scheduler_without_watchlist": "no_trigger",
+    "secret_storage_without_watchlist": "no_trigger",
+    "watchlist_list_review": "trigger",
+    "watchlist_scoped_pending_result": "trigger",
+    "wl_item_lifecycle_update": "trigger",
+}
 
 
 def fail(message: str) -> int:
@@ -652,6 +695,120 @@ def validate_case(
     validate_storage_contract(case_id, case, expected, errors)
 
 
+def validate_trigger_case_list(cases: object, errors: list[str]) -> int:
+    """Validate the parsed trigger eval corpus, appending problems to ``errors``.
+
+    Checks the corpus size (20 to 30 cases), each case's key set, id
+    uniqueness, locale, prompt length, expected decision, and that every
+    reason carries the polarity recorded in ``TRIGGER_REASON_EXPECTED``.
+    Afterwards requires at least 8 cases per decision and every reason in
+    ``REQUIRED_TRIGGER_REASONS``.
+
+    Args:
+        cases: Decoded JSON root value; anything but a list is rejected.
+        errors: Accumulator that receives one message per problem found.
+
+    Returns:
+        The number of cases, or 0 when the root value is not a list.
+    """
+    if not isinstance(cases, list):
+        errors.append("trigger_cases.json: root value must be a list")
+        return 0
+    if not 20 <= len(cases) <= 30:
+        errors.append("trigger_cases.json: expected 20 to 30 lightweight cases")
+
+    seen_ids: set[str] = set()
+    decisions = {"trigger": 0, "no_trigger": 0}
+    reasons: set[str] = set()
+    for index, case in enumerate(cases):
+        case_id = f"trigger_cases[{index}]"
+        if not isinstance(case, dict):
+            errors.append(f"{case_id}: case must be an object")
+            continue
+
+        extra_keys = sorted(set(case) - TRIGGER_CASE_KEYS)
+        if extra_keys:
+            errors.append(f"{case_id}: unsupported key(s): {', '.join(extra_keys)}")
+
+        require_keys(case, TRIGGER_CASE_KEYS, case_id, errors, "trigger case")
+        if not TRIGGER_CASE_KEYS.issubset(case):
+            continue
+
+        case_id = str(case["id"])
+        if case_id in seen_ids:
+            errors.append(f"{case_id}: duplicate trigger case id")
+        seen_ids.add(case_id)
+
+        # Decoded JSON values can be lists or objects, which are unhashable, so
+        # the checks below never hash them: bad data is reported, not raised.
+        if case.get("locale") not in ("ko", "en", "mixed"):
+            errors.append(f"{case_id}: locale must be ko, en, or mixed")
+
+        prompt = case.get("prompt")
+        if not isinstance(prompt, str) or not prompt.strip():
+            errors.append(f"{case_id}: prompt must be a non-empty string")
+        elif len(prompt) > 180:
+            errors.append(f"{case_id}: prompt is too long for lightweight trigger eval")
+
+        expected = case.get("expected")
+        if not isinstance(expected, str) or expected not in decisions:
+            errors.append(f"{case_id}: expected must be trigger or no_trigger")
+        else:
+            decisions[expected] += 1
+
+        reason = case.get("reason")
+        if not isinstance(reason, str) or reason not in SUPPORTED_TRIGGER_REASONS:
+            errors.append(f"{case_id}: unsupported trigger reason: {reason}")
+        else:
+            reasons.add(reason)
+            expected_for_reason = TRIGGER_REASON_EXPECTED[reason]
+            if expected != expected_for_reason:
+                errors.append(
+                    f"{case_id}: reason {reason} must use expected={expected_for_reason}"
+                )
+
+    for decision, count in decisions.items():
+        if count < 8:
+            errors.append(f"trigger_cases.json: expected at least 8 {decision} cases")
+
+    missing_reasons = sorted(REQUIRED_TRIGGER_REASONS - reasons)
+    if missing_reasons:
+        errors.append(
+            "trigger_cases.json: missing required trigger reason(s): "
+            + ", ".join(missing_reasons)
+        )
+
+    return len(cases)
+
+
+def validate_trigger_cases(errors: list[str]) -> int:
+    """Load ``TRIGGER_CASES`` from disk and validate it.
+
+    A missing, unreadable, or malformed file is recorded in ``errors`` rather
+    than raised.
+
+    Returns:
+        The number of trigger cases, or 0 when the file could not be loaded.
+    """
+    if not TRIGGER_CASES.is_file():
+        errors.append(f"Missing trigger eval corpus: {TRIGGER_CASES.relative_to(ROOT)}")
+        return 0
+
+    try:
+        text = TRIGGER_CASES.read_text(encoding="utf-8")
+    except (OSError, UnicodeDecodeError) as exc:
+        errors.append(f"trigger_cases.json: unreadable file: {exc}")
+        return 0
+
+    try:
+        cases = json.loads(text)
+    except json.JSONDecodeError as exc:
+        errors.append(f"trigger_cases.json: invalid JSON: {exc}")
+        return 0
+
+    return validate_trigger_case_list(cases, errors)
+
+
 def main() -> int:
     errors: list[str] = []
     if not CASES_DIR.is_dir():
@@ -689,10 +815,15 @@ def main() -> int:
             f"Missing semantic case(s) for self_checks.yaml: {', '.join(missing_self_check_cases)}"
         )
 
+    trigger_case_count = validate_trigger_cases(errors)
+
     if errors:
         return fail("Semantic case check failed:\n" + "\n".join(f"- {error}" for error in errors))
 
-    print(f"Semantic case check passed: {len(case_paths)} case(s)")
+    print(
+        f"Semantic case check passed: {len(case_paths)} case(s); "
+        f"{trigger_case_count} trigger case(s)"
+    )
     return 0
 
 
diff --git a/evals/test_check_watchlist.py b/evals/test_check_watchlist.py
index 17f539a..63e97d5 100644
--- a/evals/test_check_watchlist.py
+++ b/evals/test_check_watchlist.py
@@ -21,6 +21,7 @@
 SEMANTIC_SCRIPT = REPO_ROOT / "evals" / "check_semantic_cases.py"
 PACKAGE_SCRIPT = REPO_ROOT / "evals" / "check_skill_package.py"
 REPO_VALIDATOR = REPO_ROOT / "tools" / "validate_watchlist.py"
+TRIGGER_CASES = REPO_ROOT / "evals" / "trigger_cases.json"
 
 _SEMANTIC_SPEC = importlib.util.spec_from_file_location(
     "check_semantic_cases", SEMANTIC_SCRIPT
@@ -941,7 +942,53 @@ def test_runtime_smoke_doc_tracks_pending_vendor_matrix(self):
             self.assertIn(runtime, text)
         self.assertIn("pending", text)
         self.assertIn("Record only real runtime results", text)
+        self.assertIn("Do not store transcripts, screenshots, raw logs, or long runtime output.", text)
         self.assertIn("without a bundled Python validator", text)
+        self.assertLessEqual(len(text.splitlines()), 35)
+
+    def test_trigger_eval_corpus_is_small_balanced_and_deterministic(self):
+        cases = json.loads(TRIGGER_CASES.read_text(encoding="utf-8"))
+        # Size bounds: the corpus holds between 20 and 30 cases.
+        self.assertGreaterEqual(len(cases), 20)
+        self.assertLessEqual(len(cases), 30)
+        # Both decisions must occur, each at least 8 times.
+        decisions = {case["expected"] for case in cases}
+        self.assertEqual(decisions, {"trigger", "no_trigger"})
+        self.assertGreaterEqual(
+            sum(1 for case in cases if case["expected"] == "trigger"),
+            8,
+        )
+        self.assertGreaterEqual(
+            sum(1 for case in cases if case["expected"] == "no_trigger"),
+            8,
+        )
+        # Every reason listed below must appear in the corpus at least once.
+        reasons = {case["reason"] for case in cases}
+        for reason in [
+            "explicit_watchlist_add",
+            "wl_item_lifecycle_update",
+            "watchlist_list_review",
+            "generic_reminder_without_watchlist",
+            "generic_now_check_without_watchlist",
+            "generic_lifecycle_without_watchlist",
+            "non_watchlist_wl_text",
+        ]:
+            self.assertIn(reason, reasons)
+        # Each case carries exactly the five schema keys and none of these fields.
+        forbidden_fields = {
+            "actual",
+            "response",
+            "runtime",
+            "runtime_output",
+            "transcript",
+            "screenshot",
+            "logs",
+        }
+        for case in cases:
+            with self.subTest(case=case["id"]):
+                self.assertEqual(set(case), {"id", "locale", "prompt", "expected", "reason"})
+                self.assertFalse(forbidden_fields.intersection(case))
+                self.assertLessEqual(len(case["prompt"]), 180)
 
     def test_starter_templates_label_commented_item_as_example_only(self):
         paths = [
@@ -1063,6 +1110,58 @@ def test_semantic_case_validation_rejects_unknown_category(self):
 
         self.assertIn("sample-case: category is unsupported: workflow-safety", errors)
 
+    def test_trigger_case_validation_rejects_reason_polarity_drift(self):
+        cases = [
+            {
+                "id": f"trigger-sample-{index}",
+                "locale": "en",
+                "prompt": f"Add this to WATCHLIST.md for sample {index}.",
+                "expected": "trigger",
+                "reason": "explicit_watchlist_add",
+            }
+            for index in range(9)
+        ]
+        cases.extend(
+            {
+                "id": f"no-trigger-sample-{index}",
+                "locale": "en",
+                "prompt": f"Remind me about sample {index} tomorrow.",
+                "expected": "no_trigger",
+                "reason": "generic_reminder_without_watchlist",
+            }
+            for index in range(9)
+        )
+        cases.extend(
+            [
+                {
+                    "id": "wrong-polarity-trigger",
+                    "locale": "en",
+                    "prompt": "Remind me tomorrow at 9.",
+                    "expected": "trigger",
+                    "reason": "generic_reminder_without_watchlist",
+                },
+                {
+                    "id": "wrong-polarity-no-trigger",
+                    "locale": "en",
+                    "prompt": "Add this to WATCHLIST.md.",
+                    "expected": "no_trigger",
+                    "reason": "explicit_watchlist_add",
+                },
+            ]
+        )
+        errors = []
+        # The last two cases pair a reason with the opposite expected decision.
+        SEMANTIC_CASES.validate_trigger_case_list(cases, errors)
+        # Each mismatch must be reported under the offending case's own id.
+        self.assertIn(
+            "wrong-polarity-trigger: reason generic_reminder_without_watchlist must use expected=no_trigger",
+            errors,
+        )
+        self.assertIn(
+            "wrong-polarity-no-trigger: reason explicit_watchlist_add must use expected=trigger",
+            errors,
+        )
+
     def test_false_trigger_semantic_case_validates_optional_must_not_list(self):
         case = {
             "id": "sample-case",
@@ -1115,6 +1214,7 @@ def test_semantic_case_checker_passes(self):
 
         self.assertEqual(result.returncode, 0, result.stderr + result.stdout)
         self.assertIn("Semantic case check passed", result.stdout)
+        self.assertIn("trigger case(s)", result.stdout)
 
     def test_semantic_self_check_parser_supports_single_quoted_prompts(self):
         text = """cases:
diff --git a/evals/trigger_cases.json b/evals/trigger_cases.json
new file mode 100644
index 0000000..51415ea
--- /dev/null
+++ b/evals/trigger_cases.json
@@ -0,0 +1,142 @@
+[
+  {
+    "id": "trigger-explicit-watchlist-add-en",
+    "locale": "en",
+    "prompt": "Add this to WATCHLIST.md. Check PR #25 CI at 17:00.",
+    "expected": "trigger",
+    "reason": "explicit_watchlist_add"
+  },
+  {
+    "id": "trigger-explicit-watchlist-add-kr",
+    "locale": "ko",
+    "prompt": "WATCHLIST.md에 추가해줘. 오늘 17시에 배포 확인.",
+    "expected": "trigger",
+    "reason": "explicit_watchlist_add"
+  },
+  {
+    "id": "trigger-wl-complete-en",
+    "locale": "en",
+    "prompt": "Mark WL-20260614-001 done; CI passed.",
+    "expected": "trigger",
+    "reason": "wl_item_lifecycle_update"
+  },
+  {
+    "id": "trigger-wl-drop-kr",
+    "locale": "ko",
+    "prompt": "WL-20260614-002 드롭 처리해. 더 이상 필요 없어.",
+    "expected": "trigger",
+    "reason": "wl_item_lifecycle_update"
+  },
+  {
+    "id": "trigger-watchlist-review-en",
+    "locale": "en",
+    "prompt": "Review WATCHLIST.md items due today.",
+    "expected": "trigger",
+    "reason": "watchlist_list_review"
+  },
+  {
+    "id": "trigger-watchlist-review-kr",
+    "locale": "ko",
+    "prompt": "WATCHLIST.md 목록만 보여줘.",
+    "expected": "trigger",
+    "reason": "watchlist_list_review"
+  },
+  {
+    "id": "trigger-local-private-watchlist-en",
+    "locale": "en",
+    "prompt": "Add this to watchlist as a local note. Check test logs at 18:00.",
+    "expected": "trigger",
+    "reason": "local_private_watchlist_record"
+  },
+  {
+    "id": "trigger-ambiguous-target-en",
+    "locale": "en",
+    "prompt": "Add this to watchlist. Check deployment at 17:00.",
+    "expected": "trigger",
+    "reason": "ambiguous_watchlist_target"
+  },
+  {
+    "id": "trigger-preauthorized-workflow-en",
+    "locale": "en",
+    "prompt": "For this repo's watchlist workflow, record the nightly sync check tomorrow.",
+    "expected": "trigger",
+    "reason": "preauthorized_watchlist_workflow"
+  },
+  {
+    "id": "trigger-watchlist-scoped-pending-en",
+    "locale": "en",
+    "prompt": "The WATCHLIST follow-up for PR review is pending; record checking it after CI completes.",
+    "expected": "trigger",
+    "reason": "watchlist_scoped_pending_result"
+  },
+  {
+    "id": "no-trigger-generic-reminder-en",
+    "locale": "en",
+    "prompt": "Remind me tomorrow at 9 to check deployment.",
+    "expected": "no_trigger",
+    "reason": "generic_reminder_without_watchlist"
+  },
+  {
+    "id": "no-trigger-generic-reminder-kr",
+    "locale": "ko",
+    "prompt": "내일 9시에 배포 확인하라고 리마인드해줘.",
+    "expected": "no_trigger",
+    "reason": "generic_reminder_without_watchlist"
+  },
+  {
+    "id": "no-trigger-now-check-en",
+    "locale": "en",
+    "prompt": "Check the GitHub Actions result now.",
+    "expected": "no_trigger",
+    "reason": "generic_now_check_without_watchlist"
+  },
+  {
+    "id": "no-trigger-now-check-kr",
+    "locale": "ko",
+    "prompt": "지금 배포 결과 확인해줘.",
+    "expected": "no_trigger",
+    "reason": "generic_now_check_without_watchlist"
+  },
+  {
+    "id": "no-trigger-generic-delete-en",
+    "locale": "en",
+    "prompt": "Delete README.md.",
+    "expected": "no_trigger",
+    "reason": "generic_delete_without_watchlist"
+  },
+  {
+    "id": "no-trigger-generic-complete-en",
+    "locale": "en",
+    "prompt": "The refactor is complete.",
+    "expected": "no_trigger",
+    "reason": "generic_lifecycle_without_watchlist"
+  },
+  {
+    "id": "no-trigger-generic-cancel-kr",
+    "locale": "ko",
+    "prompt": "방금 말한 작업 취소해줘.",
+    "expected": "no_trigger",
+    "reason": "generic_lifecycle_without_watchlist"
+  },
+  {
+    "id": "no-trigger-non-watchlist-wl-en",
+    "locale": "en",
+    "prompt": "The WL-ABC warehouse label is obsolete; delete it.",
+    "expected": "no_trigger",
+    "reason": "non_watchlist_wl_text"
+  },
+  {
+    "id": "no-trigger-scheduler-without-watchlist-en",
+    "locale": "en",
+    "prompt": "Schedule a calendar reminder to review CI tomorrow.",
+    "expected": "no_trigger",
+    "reason": "scheduler_without_watchlist"
+  },
+  {
+    "id": "no-trigger-secret-storage-without-watchlist-en",
+    "locale": "en",
+    "prompt": "Store my token abc123 and use it later.",
+    "expected": "no_trigger",
+    "reason": "secret_storage_without_watchlist"
+  }
+]

From 41760de7465f52e0be91b771396605221e89cf2e Mon Sep 17 00:00:00 2001
From: Injae <15044917+dd3ok@users.noreply.github.com>
Date: Mon, 15 Jun 2026 19:34:48 +0900
Subject: [PATCH 2/2] Validate trigger case ids

---
 evals/check_semantic_cases.py | 10 ++++++-
 evals/test_check_watchlist.py | 49 +++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/evals/check_semantic_cases.py b/evals/check_semantic_cases.py
index 056fef9..1836694 100644
--- a/evals/check_semantic_cases.py
+++ b/evals/check_semantic_cases.py
@@ -719,7 +719,15 @@ def validate_trigger_case_list(cases: object, errors: list[str]) -> int:
         if not TRIGGER_CASE_KEYS.issubset(case):
             continue
 
-        case_id = str(case["id"])
+        case_id_val = case.get("id")
+        if not isinstance(case_id_val, str) or not case_id_val.strip():
+            errors.append(f"{case_id}: id must be a non-empty string")
+            continue
+        if case_id_val != case_id_val.strip():
+            errors.append(f"{case_id}: id must not have leading or trailing whitespace")
+            continue
+
+        case_id = case_id_val
         if case_id in seen_ids:
             errors.append(f"{case_id}: duplicate trigger case id")
         seen_ids.add(case_id)
diff --git a/evals/test_check_watchlist.py b/evals/test_check_watchlist.py
index 63e97d5..91ab810 100644
--- a/evals/test_check_watchlist.py
+++ b/evals/test_check_watchlist.py
@@ -1162,6 +1162,55 @@ def test_trigger_case_validation_rejects_reason_polarity_drift(self):
             errors,
         )
 
+    def test_trigger_case_validation_rejects_invalid_id(self):
+        cases = [
+            {
+                "id": f"trigger-sample-{index}",
+                "locale": "en",
+                "prompt": f"Add this to WATCHLIST.md for sample {index}.",
+                "expected": "trigger",
+                "reason": "explicit_watchlist_add",
+            }
+            for index in range(9)
+        ]
+        cases.extend(
+            {
+                "id": f"no-trigger-sample-{index}",
+                "locale": "en",
+                "prompt": f"Remind me about sample {index} tomorrow.",
+                "expected": "no_trigger",
+                "reason": "generic_reminder_without_watchlist",
+            }
+            for index in range(9)
+        )
+        cases.extend(
+            [
+                {
+                    "id": 123,
+                    "locale": "en",
+                    "prompt": "Add this to WATCHLIST.md.",
+                    "expected": "trigger",
+                    "reason": "explicit_watchlist_add",
+                },
+                {
+                    "id": " whitespace-id ",
+                    "locale": "en",
+                    "prompt": "Remind me tomorrow at 9.",
+                    "expected": "no_trigger",
+                    "reason": "generic_reminder_without_watchlist",
+                },
+            ]
+        )
+        errors = []
+        # Case 18 has a non-string id; case 19 has an id padded with whitespace.
+        SEMANTIC_CASES.validate_trigger_case_list(cases, errors)
+        # Rejected ids are reported under the positional trigger_cases[index] label.
+        self.assertIn("trigger_cases[18]: id must be a non-empty string", errors)
+        self.assertIn(
+            "trigger_cases[19]: id must not have leading or trailing whitespace",
+            errors,
+        )
+
     def test_false_trigger_semantic_case_validates_optional_must_not_list(self):
         case = {
             "id": "sample-case",