diff --git a/.github/actions/ai_summary/job/README.md b/.github/actions/ai_summary/job/README.md index f5da34d..726bc9e 100644 --- a/.github/actions/ai_summary/job/README.md +++ b/.github/actions/ai_summary/job/README.md @@ -55,6 +55,17 @@ The action takes inline JSON — no separate config file. Required fields: `input_dirs`/`output_dir` pass through unchanged. - `input_dirs` — directories with `.log` files to analyze. - `output_dir` — where to write the per-job summary. +- `log_complete_marker` (optional override) — regex for the completion + marker the caller's test wrapper appends as the final log line. Bundled + default matches `[==tt-log-finish-line==]`, with an optional + `exit_code=N` payload (group 1). Marker absent means the step's shell was + hard-killed — the GitHub `timeout-minutes` kill, invisible in the log + itself — so a clean-looking log classifies as TIMEOUT instead of a false + SUCCESS. When the log already shows a crash/failure, that status wins and + the truncation is reported as a possibly independent issue + (`_job.log_complete: false` in the JSON). Callers whose wrapper does not + write the marker must pass `"log_complete_marker": null`, otherwise every + passing job misclassifies as TIMEOUT. Categories, layers, and analysis patterns come from the bundled `analysis.yaml` shipped with the tool. diff --git a/.github/actions/ai_summary/tool/ai_job_summary/cli.py b/.github/actions/ai_summary/tool/ai_job_summary/cli.py index 4623530..eb6c8ba 100644 --- a/.github/actions/ai_summary/tool/ai_job_summary/cli.py +++ b/.github/actions/ai_summary/tool/ai_job_summary/cli.py @@ -105,6 +105,7 @@ def _build_json( "exit_code": extracted_log.exit_code, "has_crash": extracted_log.has_crash, "has_timeout": extracted_log.has_timeout, + "log_complete": extracted_log.log_complete, "failed_tests": extracted_log.failed_tests, } ) @@ -176,6 +177,7 @@ def main(): test_patterns = { "test_result_patterns": config.get("test_patterns", []), "failed_test_patterns": config.get("failed_test_patterns", []), + "log_complete_marker": config.get("log_complete_marker"), } # workspace anchors relative paths in input_dirs/output_dir; absolute @@ -246,6 +248,12 @@ def _timeout_handler(signum, frame): job_status = get_job_status(extracted) print(f"Status: {job_status.status_text}", file=sys.stderr) + # "Would be SUCCESS if not for the absent completion marker." Lets the + # missing-dir infra paths below win over a marker-absence TIMEOUT. + clean_apart_from_marker = job_status.is_success or ( + job_status.status_text == "TIMEOUT" and extracted.log_complete is False + ) + # Success + no missing dirs → done if job_status.is_success and not is_infra_failure: summary = FailureSummary() @@ -272,8 +280,41 @@ def _timeout_handler(signum, frame): ) return + # Clean log, marker absent, all dirs present → GitHub timeout-minutes + # kill. Authoritative; no LLM needed. + if ( + job_status.status_text == "TIMEOUT" + and extracted.log_complete is False + and not is_infra_failure + and not extracted.error_sections + ): + summary = FailureSummary() + summary.category = "infra:timeout" + context = CIContext() + md = format_summary_markdown( + summary, + context, + job_status, + extracted_log=extracted, + job_name=job_name, + job_url=job_url, + ) + _write_outputs( + output_dir, + job_id, + md, + _build_json( + summary, + job_status, + extracted, + job_name=job_name, + job_url=job_url, + ), + ) + return + # Partial dirs + no errors → INFRA_FAILURE without LLM - if job_status.is_success and is_infra_failure and not extracted.error_sections: + if clean_apart_from_marker and is_infra_failure and not extracted.error_sections: summary = FailureSummary() summary.category = "infra:partial_logs" summary.root_cause = f"Missing log dirs: {missing_dirs}" diff --git a/.github/actions/ai_summary/tool/ai_job_summary/config.py b/.github/actions/ai_summary/tool/ai_job_summary/config.py index deeb791..672bc08 100644 --- a/.github/actions/ai_summary/tool/ai_job_summary/config.py +++ b/.github/actions/ai_summary/tool/ai_job_summary/config.py @@ -79,6 +79,7 @@ def load_config(project: dict | None = None) -> dict: config.setdefault("categories", {}) config.setdefault("test_patterns", []) config.setdefault("failed_test_patterns", []) + config.setdefault("log_complete_marker", None) config.setdefault("detection_patterns", {}) config.setdefault("repos", {"default_branches": ["main", "master", "dev"]}) diff --git a/.github/actions/ai_summary/tool/ai_job_summary/config/analysis.yaml b/.github/actions/ai_summary/tool/ai_job_summary/config/analysis.yaml index 0a3e224..1d78f48 100644 --- a/.github/actions/ai_summary/tool/ai_job_summary/config/analysis.yaml +++ b/.github/actions/ai_summary/tool/ai_job_summary/config/analysis.yaml @@ -3,6 +3,14 @@ # Consumers extend via analysis_config: in their workflow YAML, and pick # the LLM model via the per-project config file's `model:` field. +# Regex for the completion marker the caller's wrapper appends as the final +# log line, e.g. '[==tt-log-finish-line==] exit_code=0'. Token presence is the +# contract; the exit_code payload (group 1) is optional. Marker absent => +# shell was hard-killed (GitHub timeout-minutes) => TIMEOUT instead of false +# SUCCESS. Callers whose wrapper does not write the marker must override +# with null in their per-project config. +log_complete_marker: '^\[==tt-log-finish-line==\]\s*(?:exit_code=(\d+))?' + # Regexes scanned over the full log to set has_crash / has_timeout. Project # configs extend each group additively. Case-sensitivity is per group (see # the tool): crash/timeout match case-insensitively; python/killed do not. diff --git a/.github/actions/ai_summary/tool/ai_job_summary/extract.py b/.github/actions/ai_summary/tool/ai_job_summary/extract.py index 4cbf4ae..5a9d2e5 100644 --- a/.github/actions/ai_summary/tool/ai_job_summary/extract.py +++ b/.github/actions/ai_summary/tool/ai_job_summary/extract.py @@ -89,6 +89,8 @@ class ExtractedLog: exit_code: int | None = None has_crash: bool = False # TT_FATAL, panic, etc. has_timeout: bool = False + # None = marker not configured; False = marker absent (shell hard-killed) + log_complete: bool | None = None failed_tests: list[str] = field(default_factory=list) # Test failures (features missing) failed_evals: list[str] = field(default_factory=list) # Eval failures (accuracy below target) @@ -183,6 +185,11 @@ def get_job_status(extracted_log: ExtractedLog) -> JobStatus: return JobStatus(False, "ORANGE", f"TESTS FAILED ({len(extracted_log.failed_tests)} failed)") if extracted_log.failed_evals: return JobStatus(False, "YELLOW", f"EVALS BELOW TARGET ({len(extracted_log.failed_evals)} failed)") + # Marker absent on an otherwise clean log = GitHub timeout-minutes kill. + # Crash/failure branches above win: truncation is then reported as a + # possibly separate issue, not as the status. + if extracted_log.log_complete is False: + return JobStatus(False, "RED", "TIMEOUT") return JobStatus(True, "GREEN", "SUCCESS") @@ -414,6 +421,19 @@ def _any(groups: list[str], flags=0) -> bool: if match := re.search(r"Process completed with exit code (\d+)", full_text): result.exit_code = int(match.group(1)) + # Completion marker: the caller's wrapper appends it as the log's final + # line; absence means the shell was hard-killed (GitHub timeout-minutes), + # which leaves no trace in the tee'd log itself. + marker_pattern = (test_patterns or {}).get("log_complete_marker") + if marker_pattern: + if match := re.search(marker_pattern, full_text, re.MULTILINE): + result.log_complete = True + # group 1 (optional) = wrapped command's exit code + if match.groups() and match.group(1) is not None and match.group(1).isdigit(): + result.exit_code = int(match.group(1)) + else: + result.log_complete = False + # Use provided test patterns or default to empty if test_patterns is None: test_patterns = {} @@ -920,6 +940,17 @@ def format_extracted_log(extracted: ExtractedLog) -> str: for name, cfg in list(lc.framework.items())[:5]: parts.append(f" {name}: {cfg.value}") + if extracted.log_complete is False: + parts.append("\n" + "=" * 60) + parts.append("INCOMPLETE LOG") + parts.append("=" * 60) + parts.append( + "The log ended without its completion marker: the step was killed " + "by the GitHub timeout-minutes limit (test ran too long or hung). " + "Any errors below may be a separate, independent issue — do not " + "assume they caused the timeout." + ) + parts.append("\n" + "=" * 60) parts.append(f"ERROR SECTIONS ({len(extracted.error_sections)} found)") parts.append("=" * 60) diff --git a/.github/actions/ai_summary/tool/ai_job_summary/summarize.py b/.github/actions/ai_summary/tool/ai_job_summary/summarize.py index b17032d..b20b1be 100644 --- a/.github/actions/ai_summary/tool/ai_job_summary/summarize.py +++ b/.github/actions/ai_summary/tool/ai_job_summary/summarize.py @@ -386,6 +386,12 @@ def format_summary_markdown( else: md = f"### {_emoji(job_status.status_code)} {job_status.status_text}\n" + if extracted_log and extracted_log.log_complete is False: + if job_status.status_text == "TIMEOUT": + md += "⚠️ Log ended without its completion marker — step killed by GitHub `timeout-minutes` (test ran too long or hung).\n" + else: + md += "⚠️ Log is incomplete — step killed by GitHub `timeout-minutes` after the failure below; possibly an independent hang in a later test.\n" + # Show problematic layer if different from error layer layer_info = f"| **Error Layer** | `{summary.layer}` |" if summary.problematic_layer and summary.problematic_layer != summary.layer: diff --git a/.github/actions/ai_summary/tool/ai_job_summary/tests/test_cli.py b/.github/actions/ai_summary/tool/ai_job_summary/tests/test_cli.py index 0f61b14..cb24dcd 100644 --- a/.github/actions/ai_summary/tool/ai_job_summary/tests/test_cli.py +++ b/.github/actions/ai_summary/tool/ai_job_summary/tests/test_cli.py @@ -120,14 +120,14 @@ def test_empty(self): class TestResolveLogDirs: def test_all_present(self, tmp_path): - _write_log(tmp_path / "run_logs", "run.log", "some log content\n") + _write_log(tmp_path / "run_logs", "run.log", "some log content\n[==tt-log-finish-line==] exit_code=0\n") _write_log(tmp_path / "docker_server", "server.log", "server log\n") present, missing = _resolve_log_dirs(["run_logs", "docker_server"], tmp_path) assert len(present) == 2 assert missing == [] def test_one_missing(self, tmp_path): - _write_log(tmp_path / "run_logs", "run.log", "some log content\n") + _write_log(tmp_path / "run_logs", "run.log", "some log content\n[==tt-log-finish-line==] exit_code=0\n") present, missing = _resolve_log_dirs(["run_logs", "docker_server"], tmp_path) assert len(present) == 1 assert missing == ["docker_server"] @@ -145,7 +145,7 @@ def test_all_missing(self, tmp_path): def test_absolute_log_dirs_ignore_base(self, tmp_path): # Pathlib: Path(base) / Path(absolute) returns the absolute. - _write_log(tmp_path / "logs", "run.log", "ok\n") + _write_log(tmp_path / "logs", "run.log", "ok\n[==tt-log-finish-line==] exit_code=0\n") present, missing = _resolve_log_dirs([str(tmp_path / "logs")], Path("/nonexistent")) assert len(present) == 1 assert missing == [] @@ -215,7 +215,7 @@ class TestWorkspace: """workspace field anchors relative input_dirs and output_dir.""" def test_relative_input_dirs_resolve_against_workspace(self, tmp_path): - _write_log(tmp_path / "logs", "run.log", "INFO: ok\n") + _write_log(tmp_path / "logs", "run.log", "INFO: ok\n[==tt-log-finish-line==] exit_code=0\n") config = json.dumps( { "model": "test", @@ -228,7 +228,7 @@ def test_relative_input_dirs_resolve_against_workspace(self, tmp_path): assert any(tmp_path.glob("ai_job_summary*.md")) def test_relative_output_dir_resolves_against_workspace(self, tmp_path): - _write_log(tmp_path / "logs", "run.log", "INFO: ok\n") + _write_log(tmp_path / "logs", "run.log", "INFO: ok\n[==tt-log-finish-line==] exit_code=0\n") config = json.dumps( { "model": "test", @@ -245,7 +245,7 @@ def test_env_vars_in_paths_are_expanded(self, tmp_path, monkeypatch): # This is the only way to express the in-container workspace path # for container jobs (the ${{ github.workspace }} expression # always resolves to the host path). - _write_log(tmp_path / "logs", "run.log", "INFO: ok\n") + _write_log(tmp_path / "logs", "run.log", "INFO: ok\n[==tt-log-finish-line==] exit_code=0\n") monkeypatch.setenv("GITHUB_WORKSPACE", str(tmp_path)) config = json.dumps( { @@ -266,7 +266,7 @@ class TestDualOutput: """Tool always writes both .md and .json.""" def test_writes_both_files(self, tmp_path): - _write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n") + _write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n[==tt-log-finish-line==] exit_code=0\n") _write_log(tmp_path / "docker_server", "server.log", "INFO: server ready\n") config = _config_json(tmp_path, ["run_logs", "docker_server"]) _run_cli(["--config", config]) @@ -274,7 +274,7 @@ def test_writes_both_files(self, tmp_path): assert any(tmp_path.glob("*.json")) def test_json_has_job_block(self, tmp_path): - _write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n") + _write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n[==tt-log-finish-line==] exit_code=0\n") _write_log(tmp_path / "docker_server", "server.log", "INFO: server ready\n") config = _config_json(tmp_path, ["run_logs", "docker_server"]) _run_cli(["--config", config]) @@ -283,7 +283,7 @@ def test_json_has_job_block(self, tmp_path): assert "status" in data["_job"] def test_filename_uses_job_id_from_url(self, tmp_path): - _write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n") + _write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n[==tt-log-finish-line==] exit_code=0\n") _write_log(tmp_path / "docker_server", "server.log", "INFO: server ready\n") config = _config_json(tmp_path, ["run_logs", "docker_server"]) _run_cli(["--config", config, "--job-url", "https://github.com/org/repo/actions/runs/1/job/99999"]) @@ -292,7 +292,7 @@ def test_filename_uses_job_id_from_url(self, tmp_path): def test_fallback_filename_without_job_url(self, tmp_path): """When no --job-url, files named ai_job_summary.md/.json (no ID suffix).""" - _write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n") + _write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n[==tt-log-finish-line==] exit_code=0\n") _write_log(tmp_path / "docker_server", "server.log", "INFO: server ready\n") config = _config_json(tmp_path, ["run_logs", "docker_server"]) _run_cli(["--config", config]) @@ -517,6 +517,7 @@ def success_dir(self, tmp_path): 2026-03-15 09:00:01,000 - server.py:50 - INFO: Server started on port 8000 2026-03-15 09:01:30,500 - run_workflows.py:101 - INFO: workflow: benchmarks completed with return code: 0 2026-03-15 09:02:46,000 - run.py:550 - INFO: All workflows completed successfully + [==tt-log-finish-line==] exit_code=0 """ ), ) @@ -542,6 +543,49 @@ def test_job_name_in_json(self, success_dir): assert data["_job"]["name"] == "Llama-3.1-8B (n150)" +class TestTimeoutMarkerAbsent: + """Clean log without the completion marker → GitHub timeout-minutes kill.""" + + def test_clean_truncated_log_is_timeout_without_llm(self, tmp_path): + _write_log(tmp_path / "logs", "run.log", "INFO: running test_a\nINFO: running test_b\n") + config = _config_json(tmp_path, ["logs"]) + with patch("ai_job_summary.cli.get_llm_client") as mock_llm: + _run_cli(["--config", config]) + mock_llm.assert_not_called() + data = json.loads(_read_summary(tmp_path, ".json")) + assert data["_job"]["status"] == "TIMEOUT" + assert data["_job"]["log_complete"] is False + + def test_marker_present_clean_log_is_success(self, tmp_path): + _write_log(tmp_path / "logs", "run.log", "INFO: ok\n[==tt-log-finish-line==] exit_code=0\n") + config = _config_json(tmp_path, ["logs"]) + with patch("ai_job_summary.cli.get_llm_client") as mock_llm: + _run_cli(["--config", config]) + mock_llm.assert_not_called() + data = json.loads(_read_summary(tmp_path, ".json")) + assert data["_job"]["status"] == "SUCCESS" + + def test_missing_dir_wins_over_absent_marker(self, tmp_path): + # present log markerless + a dir missing → INFRA partial_logs, not TIMEOUT + _write_log(tmp_path / "run_logs", "run.log", "INFO: some log content\n") + config = _config_json(tmp_path, ["run_logs", "docker_server"]) + with patch("ai_job_summary.cli.get_llm_client") as mock_llm: + _run_cli(["--config", config]) + mock_llm.assert_not_called() + data = json.loads(_read_summary(tmp_path, ".json")) + assert data["_job"]["status"] == "INFRA FAILURE" + assert data["category"] == "infra:partial_logs" + + def test_marker_disabled_keeps_clean_log_success(self, tmp_path): + _write_log(tmp_path / "logs", "run.log", "INFO: ok\n") + config = _config_json(tmp_path, ["logs"], log_complete_marker=None) + with patch("ai_job_summary.cli.get_llm_client") as mock_llm: + _run_cli(["--config", config]) + mock_llm.assert_not_called() + data = json.loads(_read_summary(tmp_path, ".json")) + assert data["_job"]["status"] == "SUCCESS" + + # ── Crash: LLM overrides extraction (using real fixture + mock response) ────── diff --git a/.github/actions/ai_summary/tool/ai_job_summary/tests/test_log_complete.py b/.github/actions/ai_summary/tool/ai_job_summary/tests/test_log_complete.py new file mode 100644 index 0000000..91e3a20 --- /dev/null +++ b/.github/actions/ai_summary/tool/ai_job_summary/tests/test_log_complete.py @@ -0,0 +1,137 @@ +# SPDX-FileCopyrightText: (c) 2026 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 +""" +Tests for the log-completion marker (log_complete_marker config). + +The caller's test wrapper appends '[==tt-log-finish-line==] exit_code=N' as the +final log line. Marker absent => shell was hard-killed (GitHub timeout-minutes) +=> TIMEOUT instead of a false SUCCESS. A crash/failure already in the log wins +over the marker verdict. +""" + +from ai_job_summary.config import load_config +from ai_job_summary.extract import ExtractedLog, JobStatus, extract_log, get_job_status +from ai_job_summary.summarize import FailureSummary, format_summary_markdown +from ai_job_summary.context import CIContext + +MARKER_REGEX = r"^\[==tt-log-finish-line==\]\s*(?:exit_code=(\d+))?" +PATTERNS = {"log_complete_marker": MARKER_REGEX} + +CLEAN_LINES = [ + "2026-06-11T00:42:13 pytest session starts", + "test_a PASSED", + "test_b PASSED", +] +CRASH_LINES = [ + "2026-06-11T00:42:13 pytest session starts", + "test_a PASSED", + "TT_FATAL: device hung on op dispatch", +] +FINISH_OK = "[==tt-log-finish-line==] exit_code=0" +FINISH_FAIL = "[==tt-log-finish-line==] exit_code=2" + + +def _extract(tmp_path, lines, test_patterns=PATTERNS): + log = tmp_path / "test.log" + log.write_text("\n".join(lines) + "\n") + return extract_log(log, test_patterns=test_patterns) + + +class TestMarkerExtraction: + def test_marker_present_sets_log_complete(self, tmp_path): + e = _extract(tmp_path, CLEAN_LINES + [FINISH_OK]) + assert e.log_complete is True + + def test_marker_carries_exit_code(self, tmp_path): + e = _extract(tmp_path, CLEAN_LINES + [FINISH_OK]) + assert e.exit_code == 0 + + def test_marker_nonzero_exit_code(self, tmp_path): + e = _extract(tmp_path, CLEAN_LINES + [FINISH_FAIL]) + assert e.exit_code == 2 + + def test_marker_absent_sets_log_complete_false(self, tmp_path): + e = _extract(tmp_path, CLEAN_LINES) + assert e.log_complete is False + + def test_marker_not_configured_leaves_none(self, tmp_path): + e = _extract(tmp_path, CLEAN_LINES, test_patterns={}) + assert e.log_complete is None + + def test_bare_token_without_payload_counts(self, tmp_path): + e = _extract(tmp_path, CLEAN_LINES + ["[==tt-log-finish-line==]"]) + assert e.log_complete is True + assert e.exit_code is None + + def test_bundled_default_matches_wrapper_line(self, tmp_path): + marker = load_config()["log_complete_marker"] + e = _extract(tmp_path, CLEAN_LINES + [FINISH_FAIL], test_patterns={"log_complete_marker": marker}) + assert e.log_complete is True + assert e.exit_code == 2 + + def test_marker_must_anchor_line_start(self, tmp_path): + # embedded in other output (e.g. echoed cmd) must not count + lines = CLEAN_LINES + ["echo [==tt-log-finish-line==] exit_code=0"] + e = _extract(tmp_path, lines) + assert e.log_complete is False + + +class TestStatusWithMarker: + def test_clean_truncated_log_is_timeout(self, tmp_path): + e = _extract(tmp_path, CLEAN_LINES) + s = get_job_status(e) + assert s.status_text == "TIMEOUT" + assert not s.is_success + + def test_clean_complete_log_is_success(self, tmp_path): + e = _extract(tmp_path, CLEAN_LINES + [FINISH_OK]) + s = get_job_status(e) + assert s.status_text == "SUCCESS" + + def test_crash_wins_over_truncation(self, tmp_path): + e = _extract(tmp_path, CRASH_LINES) + assert e.log_complete is False + s = get_job_status(e) + assert s.status_text == "CRASHED" + + def test_nonzero_marker_exit_code_is_failure(self, tmp_path): + e = _extract(tmp_path, CLEAN_LINES + [FINISH_FAIL]) + s = get_job_status(e) + assert "FAILED" in s.status_text + + def test_legacy_log_without_config_is_success(self, tmp_path): + e = _extract(tmp_path, CLEAN_LINES, test_patterns={}) + s = get_job_status(e) + assert s.status_text == "SUCCESS" + + +class TestReportNote: + def _md(self, log_complete, status): + e = ExtractedLog() + e.log_complete = log_complete + return format_summary_markdown(FailureSummary(), CIContext(), status, extracted_log=e) + + def test_timeout_note(self): + md = self._md(False, JobStatus(False, "RED", "TIMEOUT")) + assert "completion marker" in md + assert "timeout-minutes" in md + + def test_independent_issue_note_on_crash(self): + md = self._md(False, JobStatus(False, "RED", "CRASHED")) + assert "incomplete" in md + assert "independent" in md + + def test_no_note_when_complete(self): + md = self._md(True, JobStatus(True, "GREEN", "SUCCESS")) + assert "incomplete" not in md + assert "completion marker" not in md + + +class TestConfig: + def test_bundled_default_is_set(self): + assert "tt-log-finish-line" in load_config()["log_complete_marker"] + + def test_project_overlay_can_disable(self): + cfg = load_config({"log_complete_marker": None}) + assert cfg["log_complete_marker"] is None