dd3ok · dd3ok · Jun 15, 2026 · Jun 15, 2026 · Jun 15, 2026
diff --git a/docs/runtime-smoke.md b/docs/runtime-smoke.md
@@ -1,6 +1,6 @@
 # Runtime Smoke Matrix
 
-This file tracks manual smoke checks in real agent runtimes. Record only real runtime results; do not mark a row as pass based on README guidance or CI alone.
+This file tracks manual smoke checks in real agent runtimes. Record only real runtime results; do not mark a row as pass based on README guidance or CI alone. Do not store transcripts, screenshots, raw logs, or long runtime output.
 
 ## Matrix
 

diff --git a/evals/check_semantic_cases.py b/evals/check_semantic_cases.py
@@ -17,6 +17,7 @@
 PROMPTS_CSV = ROOT / "evals" / "prompts.csv"
 SELF_CHECKS = ROOT / "evals" / "self_checks.yaml"
 CHECK_WATCHLIST = ROOT / "evals" / "check_watchlist.py"
+TRIGGER_CASES = ROOT / "evals" / "trigger_cases.json"
 
 AUTONOMOUS_REMINDER_FORBIDDEN = {
     "I'll remind you",
@@ -62,6 +63,48 @@
     "storage-policy",
     "agent-workflow-safety",
 }
+SUPPORTED_TRIGGER_REASONS = {  # every reason value validate_trigger_case_list accepts
+    "ambiguous_watchlist_target",
+    "explicit_watchlist_add",
+    "generic_delete_without_watchlist",
+    "generic_lifecycle_without_watchlist",
+    "generic_now_check_without_watchlist",
+    "generic_reminder_without_watchlist",
+    "local_private_watchlist_record",
+    "non_watchlist_wl_text",
+    "preauthorized_watchlist_workflow",
+    "scheduler_without_watchlist",
+    "secret_storage_without_watchlist",
+    "watchlist_list_review",
+    "watchlist_scoped_pending_result",
+    "wl_item_lifecycle_update",
+}
+REQUIRED_TRIGGER_REASONS = {  # reasons that must occur at least once in the corpus
+    "explicit_watchlist_add",
+    "wl_item_lifecycle_update",
+    "watchlist_list_review",
+    "generic_reminder_without_watchlist",
+    "generic_now_check_without_watchlist",
+    "generic_lifecycle_without_watchlist",
+    "non_watchlist_wl_text",
+}
+TRIGGER_CASE_KEYS = {"id", "locale", "prompt", "expected", "reason"}  # exact keys of one case
+TRIGGER_REASON_EXPECTED = {  # reason -> the only `expected` value it may be paired with
+    "ambiguous_watchlist_target": "trigger",
+    "explicit_watchlist_add": "trigger",
+    "generic_delete_without_watchlist": "no_trigger",
+    "generic_lifecycle_without_watchlist": "no_trigger",
+    "generic_now_check_without_watchlist": "no_trigger",
+    "generic_reminder_without_watchlist": "no_trigger",
+    "local_private_watchlist_record": "trigger",
+    "non_watchlist_wl_text": "no_trigger",
+    "preauthorized_watchlist_workflow": "trigger",
+    "scheduler_without_watchlist": "no_trigger",
+    "secret_storage_without_watchlist": "no_trigger",
+    "watchlist_list_review": "trigger",
+    "watchlist_scoped_pending_result": "trigger",
+    "wl_item_lifecycle_update": "trigger",
+}
 
 
 def fail(message: str) -> int:
@@ -652,6 +695,97 @@ def validate_case(
     validate_storage_contract(case_id, case, expected, errors)
 
 
+def validate_trigger_case_list(cases: object, errors: list[str]) -> int:
+    """Validate the parsed trigger corpus, appending each problem to ``errors``.
+
+    Returns ``len(cases)``, or 0 when ``cases`` is not a list.
+    """
+    if not isinstance(cases, list):
+        errors.append("trigger_cases.json: root value must be a list")
+        return 0
+    if not 20 <= len(cases) <= 30:
+        errors.append("trigger_cases.json: expected 20 to 30 lightweight cases")
+
+    seen_ids: set[str] = set()
+    decisions = {"trigger": 0, "no_trigger": 0}
+    reasons: set[str] = set()
+    for index, case in enumerate(cases):
+        case_id = f"trigger_cases[{index}]"
+        if not isinstance(case, dict):
+            errors.append(f"{case_id}: case must be an object")
+            continue
+        extra_keys = sorted(set(case) - TRIGGER_CASE_KEYS)
+        if extra_keys:
+            errors.append(f"{case_id}: unsupported key(s): {', '.join(extra_keys)}")
+        require_keys(case, TRIGGER_CASE_KEYS, case_id, errors, "trigger case")
+        if not TRIGGER_CASE_KEYS.issubset(case):
+            continue  # field checks below need every required key
+
+        case_id_val = case.get("id")
+        if not isinstance(case_id_val, str) or not case_id_val.strip():
+            errors.append(f"{case_id}: id must be a non-empty string")
+            continue
+        if case_id_val != case_id_val.strip():
+            errors.append(f"{case_id}: id must not have leading or trailing whitespace")
+            continue
+        case_id = case_id_val  # label later errors with the real id
+        if case_id in seen_ids:
+            errors.append(f"{case_id}: duplicate trigger case id")
+        seen_ids.add(case_id)
+
+        # JSON arrays/objects are unhashable: type-check before set/dict lookups.
+        locale = case.get("locale")
+        if not isinstance(locale, str) or locale not in {"ko", "en", "mixed"}:
+            errors.append(f"{case_id}: locale must be ko, en, or mixed")
+        prompt = case.get("prompt")
+        if not isinstance(prompt, str) or not prompt.strip():
+            errors.append(f"{case_id}: prompt must be a non-empty string")
+        elif len(prompt) > 180:
+            errors.append(f"{case_id}: prompt is too long for lightweight trigger eval")
+
+        expected = case.get("expected")
+        if not isinstance(expected, str) or expected not in decisions:
+            errors.append(f"{case_id}: expected must be trigger or no_trigger")
+        else:
+            decisions[expected] += 1
+
+        reason = case.get("reason")
+        if not isinstance(reason, str) or reason not in SUPPORTED_TRIGGER_REASONS:
+            errors.append(f"{case_id}: unsupported trigger reason: {reason}")
+        else:
+            reasons.add(reason)
+            expected_for_reason = TRIGGER_REASON_EXPECTED[reason]
+            if expected != expected_for_reason:
+                errors.append(
+                    f"{case_id}: reason {reason} must use expected={expected_for_reason}"
+                )
+
+    for decision, count in decisions.items():
+        if count < 8:
+            errors.append(f"trigger_cases.json: expected at least 8 {decision} cases")
+    missing_reasons = sorted(REQUIRED_TRIGGER_REASONS - reasons)
+    if missing_reasons:
+        errors.append(
+            "trigger_cases.json: missing required trigger reason(s): "
+            + ", ".join(missing_reasons)
+        )
+    return len(cases)
+
+
+def validate_trigger_cases(errors: list[str]) -> int:
+    """Load TRIGGER_CASES and validate it; returns the case count, 0 if unloadable."""
+    if not TRIGGER_CASES.is_file():
+        errors.append(f"Missing trigger eval corpus: {TRIGGER_CASES.relative_to(ROOT)}")
+        return 0
+    try:
+        cases = json.loads(TRIGGER_CASES.read_text(encoding="utf-8"))
+    except (json.JSONDecodeError, UnicodeDecodeError) as exc:  # bad UTF-8 is reported, not raised
+        errors.append(f"trigger_cases.json: invalid JSON: {exc}")
+        return 0
+
+    return validate_trigger_case_list(cases, errors)
+
+
 def main() -> int:
     errors: list[str] = []
     if not CASES_DIR.is_dir():
@@ -689,10 +823,15 @@ def main() -> int:
             f"Missing semantic case(s) for self_checks.yaml: {', '.join(missing_self_check_cases)}"
         )
 
+    trigger_case_count = validate_trigger_cases(errors)
+
     if errors:
         return fail("Semantic case check failed:\n" + "\n".join(f"- {error}" for error in errors))
 
-    print(f"Semantic case check passed: {len(case_paths)} case(s)")
+    print(
+        f"Semantic case check passed: {len(case_paths)} case(s); "
+        f"{trigger_case_count} trigger case(s)"
+    )
     return 0
 
 

diff --git a/evals/test_check_watchlist.py b/evals/test_check_watchlist.py
@@ -21,6 +21,7 @@
 SEMANTIC_SCRIPT = REPO_ROOT / "evals" / "check_semantic_cases.py"
 PACKAGE_SCRIPT = REPO_ROOT / "evals" / "check_skill_package.py"
 REPO_VALIDATOR = REPO_ROOT / "tools" / "validate_watchlist.py"
+TRIGGER_CASES = REPO_ROOT / "evals" / "trigger_cases.json"
 
 _SEMANTIC_SPEC = importlib.util.spec_from_file_location(
     "check_semantic_cases", SEMANTIC_SCRIPT
@@ -941,7 +942,53 @@ def test_runtime_smoke_doc_tracks_pending_vendor_matrix(self):
             self.assertIn(runtime, text)
         self.assertIn("pending", text)
         self.assertIn("Record only real runtime results", text)
+        self.assertIn("Do not store transcripts, screenshots, raw logs, or long runtime output.", text)
         self.assertIn("without a bundled Python validator", text)
+        self.assertLessEqual(len(text.splitlines()), 35)
+
+    def test_trigger_eval_corpus_is_small_balanced_and_deterministic(self):
+        """Pin the corpus size, decision balance, required reasons, and key set."""
+        corpus = json.loads(TRIGGER_CASES.read_text(encoding="utf-8"))
+        total = len(corpus)
+        self.assertGreaterEqual(total, 20)
+        self.assertLessEqual(total, 30)
+
+        # Both decisions are present, each with at least eight cases.
+        expected_values = [entry["expected"] for entry in corpus]
+        self.assertEqual(set(expected_values), {"trigger", "no_trigger"})
+        for decision in ("trigger", "no_trigger"):
+            self.assertGreaterEqual(expected_values.count(decision), 8)
+
+        # Every reason below must be used by at least one case.
+        seen_reasons = {entry["reason"] for entry in corpus}
+        required_reasons = (
+            "explicit_watchlist_add",
+            "wl_item_lifecycle_update",
+            "watchlist_list_review",
+            "generic_reminder_without_watchlist",
+            "generic_now_check_without_watchlist",
+            "generic_lifecycle_without_watchlist",
+            "non_watchlist_wl_text",
+        )
+        for required in required_reasons:
+            self.assertIn(required, seen_reasons)
+
+        # A case carries exactly the five schema keys, so no runtime artifacts.
+        allowed_keys = {"id", "locale", "prompt", "expected", "reason"}
+        banned_keys = {
+            "actual",
+            "response",
+            "runtime",
+            "runtime_output",
+            "transcript",
+            "screenshot",
+            "logs",
+        }
+        for entry in corpus:
+            with self.subTest(case=entry["id"]):
+                self.assertEqual(set(entry), allowed_keys)
+                self.assertFalse(banned_keys.intersection(entry))
+                self.assertLessEqual(len(entry["prompt"]), 180)
 
     def test_starter_templates_label_commented_item_as_example_only(self):
         paths = [
@@ -1063,6 +1110,107 @@ def test_semantic_case_validation_rejects_unknown_category(self):
 
         self.assertIn("sample-case: category is unsupported: workflow-safety", errors)
 
+    def test_trigger_case_validation_rejects_reason_polarity_drift(self):
+        """A reason paired with the opposite ``expected`` value is reported per case."""
+        valid_triggers = [
+            {
+                "id": f"trigger-sample-{number}",
+                "locale": "en",
+                "prompt": f"Add this to WATCHLIST.md for sample {number}.",
+                "expected": "trigger",
+                "reason": "explicit_watchlist_add",
+            }
+            for number in range(9)
+        ]
+        valid_no_triggers = [
+            {
+                "id": f"no-trigger-sample-{number}",
+                "locale": "en",
+                "prompt": f"Remind me about sample {number} tomorrow.",
+                "expected": "no_trigger",
+                "reason": "generic_reminder_without_watchlist",
+            }
+            for number in range(9)
+        ]
+        # Two otherwise well-formed cases whose decision contradicts their reason.
+        reminder_marked_trigger = {
+            "id": "wrong-polarity-trigger",
+            "locale": "en",
+            "prompt": "Remind me tomorrow at 9.",
+            "expected": "trigger",
+            "reason": "generic_reminder_without_watchlist",
+        }
+        add_marked_no_trigger = {
+            "id": "wrong-polarity-no-trigger",
+            "locale": "en",
+            "prompt": "Add this to WATCHLIST.md.",
+            "expected": "no_trigger",
+            "reason": "explicit_watchlist_add",
+        }
+        corpus = [
+            *valid_triggers,
+            *valid_no_triggers,
+            reminder_marked_trigger,
+            add_marked_no_trigger,
+        ]
+        reported = []
+        SEMANTIC_CASES.validate_trigger_case_list(corpus, reported)
+
+        for message in (
+            "wrong-polarity-trigger: reason generic_reminder_without_watchlist must use expected=no_trigger",
+            "wrong-polarity-no-trigger: reason explicit_watchlist_add must use expected=trigger",
+        ):
+            self.assertIn(message, reported)
+
+    def test_trigger_case_validation_rejects_invalid_id(self):
+        """Non-string ids and ids with surrounding whitespace are both rejected."""
+        valid_triggers = [
+            {
+                "id": f"trigger-sample-{number}",
+                "locale": "en",
+                "prompt": f"Add this to WATCHLIST.md for sample {number}.",
+                "expected": "trigger",
+                "reason": "explicit_watchlist_add",
+            }
+            for number in range(9)
+        ]
+        valid_no_triggers = [
+            {
+                "id": f"no-trigger-sample-{number}",
+                "locale": "en",
+                "prompt": f"Remind me about sample {number} tomorrow.",
+                "expected": "no_trigger",
+                "reason": "generic_reminder_without_watchlist",
+            }
+            for number in range(9)
+        ]
+        # These land at indexes 18 and 19; a rejected id is never adopted as the
+        # label, so both errors are reported by position.
+        numeric_id_case = {
+            "id": 123,
+            "locale": "en",
+            "prompt": "Add this to WATCHLIST.md.",
+            "expected": "trigger",
+            "reason": "explicit_watchlist_add",
+        }
+        padded_id_case = {
+            "id": " whitespace-id ",
+            "locale": "en",
+            "prompt": "Remind me tomorrow at 9.",
+            "expected": "no_trigger",
+            "reason": "generic_reminder_without_watchlist",
+        }
+        corpus = [*valid_triggers, *valid_no_triggers, numeric_id_case, padded_id_case]
+        reported = []
+
+        SEMANTIC_CASES.validate_trigger_case_list(corpus, reported)
+
+        self.assertIn("trigger_cases[18]: id must be a non-empty string", reported)
+        self.assertIn(
+            "trigger_cases[19]: id must not have leading or trailing whitespace",
+            reported,
+        )
+
     def test_false_trigger_semantic_case_validates_optional_must_not_list(self):
         case = {
             "id": "sample-case",
@@ -1115,6 +1263,7 @@ def test_semantic_case_checker_passes(self):
 
         self.assertEqual(result.returncode, 0, result.stderr + result.stdout)
         self.assertIn("Semantic case check passed", result.stdout)
+        self.assertIn("trigger case(s)", result.stdout)
 
     def test_semantic_self_check_parser_supports_single_quoted_prompts(self):
         text = """cases: