Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .github/actions/ai_summary/job/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,17 @@ The action takes inline JSON — no separate config file. Required fields:
`input_dirs`/`output_dir` pass through unchanged.
- `input_dirs` — directories with `.log` files to analyze.
- `output_dir` — where to write the per-job summary.
- `log_complete_marker` (optional override) — regex for the completion
marker that the caller's test wrapper appends as the final log line. The
bundled default matches `[==tt-log-finish-line==]`, with an optional
`exit_code=N` payload (group 1). A missing marker means the step's shell was
hard-killed — the GitHub `timeout-minutes` kill, which leaves no trace in the
log itself — so a clean-looking log is classified as TIMEOUT instead of a false
SUCCESS. When the log already shows a crash/failure, that status wins and
the truncation is reported as a possibly independent issue
(`_job.log_complete: false` in the JSON). Callers whose wrapper does not
write the marker must pass `"log_complete_marker": null`; otherwise every
passing job is misclassified as TIMEOUT.

Categories, layers, and analysis patterns come from the bundled
`analysis.yaml` shipped with the tool.
Expand Down
43 changes: 42 additions & 1 deletion .github/actions/ai_summary/tool/ai_job_summary/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def _build_json(
"exit_code": extracted_log.exit_code,
"has_crash": extracted_log.has_crash,
"has_timeout": extracted_log.has_timeout,
"log_complete": extracted_log.log_complete,
"failed_tests": extracted_log.failed_tests,
}
)
Expand Down Expand Up @@ -176,6 +177,7 @@ def main():
test_patterns = {
"test_result_patterns": config.get("test_patterns", []),
"failed_test_patterns": config.get("failed_test_patterns", []),
"log_complete_marker": config.get("log_complete_marker"),
}

# workspace anchors relative paths in input_dirs/output_dir; absolute
Expand Down Expand Up @@ -246,6 +248,12 @@ def _timeout_handler(signum, frame):
job_status = get_job_status(extracted)
print(f"Status: {job_status.status_text}", file=sys.stderr)

# "Would be SUCCESS if not for the absent completion marker." Lets the
# missing-dir infra paths below win over a marker-absence TIMEOUT.
clean_apart_from_marker = job_status.is_success or (
job_status.status_text == "TIMEOUT" and extracted.log_complete is False
)

# Success + no missing dirs → done
if job_status.is_success and not is_infra_failure:
summary = FailureSummary()
Expand All @@ -272,8 +280,41 @@ def _timeout_handler(signum, frame):
)
return

# Clean log, marker absent, all dirs present → GitHub timeout-minutes
# kill. Authoritative; no LLM needed.
if (
job_status.status_text == "TIMEOUT"
and extracted.log_complete is False
and not is_infra_failure
and not extracted.error_sections
):
summary = FailureSummary()
summary.category = "infra:timeout"
context = CIContext()
md = format_summary_markdown(
summary,
context,
job_status,
extracted_log=extracted,
job_name=job_name,
job_url=job_url,
)
_write_outputs(
output_dir,
job_id,
md,
_build_json(
summary,
job_status,
extracted,
job_name=job_name,
job_url=job_url,
),
)
return

# Partial dirs + no errors → INFRA_FAILURE without LLM
if job_status.is_success and is_infra_failure and not extracted.error_sections:
if clean_apart_from_marker and is_infra_failure and not extracted.error_sections:
summary = FailureSummary()
summary.category = "infra:partial_logs"
summary.root_cause = f"Missing log dirs: {missing_dirs}"
Expand Down
1 change: 1 addition & 0 deletions .github/actions/ai_summary/tool/ai_job_summary/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def load_config(project: dict | None = None) -> dict:
config.setdefault("categories", {})
config.setdefault("test_patterns", [])
config.setdefault("failed_test_patterns", [])
config.setdefault("log_complete_marker", None)
config.setdefault("detection_patterns", {})
config.setdefault("repos", {"default_branches": ["main", "master", "dev"]})

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,14 @@
# Consumers extend via analysis_config: in their workflow YAML, and pick
# the LLM model via the per-project config file's `model:` field.

# Regex for the completion marker the caller's wrapper appends as the final
# log line, e.g. '[==tt-log-finish-line==] exit_code=0'. Token presence is the
# contract; the exit_code payload (group 1) is optional. Marker absent =>
# shell was hard-killed (GitHub timeout-minutes) => TIMEOUT instead of false
# SUCCESS. Callers whose wrapper does not write the marker must override
# with null in their per-project config.
log_complete_marker: '^\[==tt-log-finish-line==\]\s*(?:exit_code=(\d+))?'

# Regexes scanned over the full log to set has_crash / has_timeout. Project
# configs extend each group additively. Case-sensitivity is per group (see
# the tool): crash/timeout match case-insensitively; python/killed do not.
Expand Down
31 changes: 31 additions & 0 deletions .github/actions/ai_summary/tool/ai_job_summary/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ class ExtractedLog:
exit_code: int | None = None
has_crash: bool = False # TT_FATAL, panic, etc.
has_timeout: bool = False
# None = marker not configured; True = marker found; False = marker absent (shell hard-killed)
log_complete: bool | None = None
failed_tests: list[str] = field(default_factory=list) # Test failures (features missing)
failed_evals: list[str] = field(default_factory=list) # Eval failures (accuracy below target)

Expand Down Expand Up @@ -183,6 +185,11 @@ def get_job_status(extracted_log: ExtractedLog) -> JobStatus:
return JobStatus(False, "ORANGE", f"TESTS FAILED ({len(extracted_log.failed_tests)} failed)")
if extracted_log.failed_evals:
return JobStatus(False, "YELLOW", f"EVALS BELOW TARGET ({len(extracted_log.failed_evals)} failed)")
# Marker absent on an otherwise clean log = GitHub timeout-minutes kill.
# Crash/failure branches above win: truncation is then reported as a
# possibly separate issue, not as the status.
if extracted_log.log_complete is False:
return JobStatus(False, "RED", "TIMEOUT")
Comment on lines +188 to +192
return JobStatus(True, "GREEN", "SUCCESS")


Expand Down Expand Up @@ -414,6 +421,19 @@ def _any(groups: list[str], flags=0) -> bool:
if match := re.search(r"Process completed with exit code (\d+)", full_text):
result.exit_code = int(match.group(1))

# Completion marker: the caller's wrapper appends it as the log's final
# line; absence means the shell was hard-killed (GitHub timeout-minutes),
# which leaves no trace in the tee'd log itself.
marker_pattern = (test_patterns or {}).get("log_complete_marker")
if marker_pattern:
if match := re.search(marker_pattern, full_text, re.MULTILINE):
result.log_complete = True
# group 1 (optional) = wrapped command's exit code
if match.groups() and match.group(1) is not None and match.group(1).isdigit():
result.exit_code = int(match.group(1))
else:
result.log_complete = False
Comment on lines +424 to +435
Comment on lines +427 to +435

# Use provided test patterns or default to empty
if test_patterns is None:
test_patterns = {}
Expand Down Expand Up @@ -920,6 +940,17 @@ def format_extracted_log(extracted: ExtractedLog) -> str:
for name, cfg in list(lc.framework.items())[:5]:
parts.append(f" {name}: {cfg.value}")

if extracted.log_complete is False:
parts.append("\n" + "=" * 60)
parts.append("INCOMPLETE LOG")
parts.append("=" * 60)
parts.append(
"The log ended without its completion marker: the step was killed "
"by the GitHub timeout-minutes limit (test ran too long or hung). "
"Any errors below may be a separate, independent issue — do not "
"assume they caused the timeout."
)

parts.append("\n" + "=" * 60)
parts.append(f"ERROR SECTIONS ({len(extracted.error_sections)} found)")
parts.append("=" * 60)
Expand Down
6 changes: 6 additions & 0 deletions .github/actions/ai_summary/tool/ai_job_summary/summarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,12 @@ def format_summary_markdown(
else:
md = f"### {_emoji(job_status.status_code)} {job_status.status_text}\n"

if extracted_log and extracted_log.log_complete is False:
if job_status.status_text == "TIMEOUT":
md += "⚠️ Log ended without its completion marker — step killed by GitHub `timeout-minutes` (test ran too long or hung).\n"
else:
md += "⚠️ Log is incomplete — step killed by GitHub `timeout-minutes` after the failure below; possibly an independent hang in a later test.\n"
Comment on lines +389 to +393

# Show problematic layer if different from error layer
layer_info = f"| **Error Layer** | `{summary.layer}` |"
if summary.problematic_layer and summary.problematic_layer != summary.layer:
Expand Down
64 changes: 54 additions & 10 deletions .github/actions/ai_summary/tool/ai_job_summary/tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,14 +120,14 @@ def test_empty(self):

class TestResolveLogDirs:
def test_all_present(self, tmp_path):
_write_log(tmp_path / "run_logs", "run.log", "some log content\n")
_write_log(tmp_path / "run_logs", "run.log", "some log content\n[==tt-log-finish-line==] exit_code=0\n")
_write_log(tmp_path / "docker_server", "server.log", "server log\n")
present, missing = _resolve_log_dirs(["run_logs", "docker_server"], tmp_path)
assert len(present) == 2
assert missing == []

def test_one_missing(self, tmp_path):
_write_log(tmp_path / "run_logs", "run.log", "some log content\n")
_write_log(tmp_path / "run_logs", "run.log", "some log content\n[==tt-log-finish-line==] exit_code=0\n")
present, missing = _resolve_log_dirs(["run_logs", "docker_server"], tmp_path)
assert len(present) == 1
assert missing == ["docker_server"]
Expand All @@ -145,7 +145,7 @@ def test_all_missing(self, tmp_path):

def test_absolute_log_dirs_ignore_base(self, tmp_path):
# Pathlib: Path(base) / Path(absolute) returns the absolute.
_write_log(tmp_path / "logs", "run.log", "ok\n")
_write_log(tmp_path / "logs", "run.log", "ok\n[==tt-log-finish-line==] exit_code=0\n")
present, missing = _resolve_log_dirs([str(tmp_path / "logs")], Path("/nonexistent"))
assert len(present) == 1
assert missing == []
Expand Down Expand Up @@ -215,7 +215,7 @@ class TestWorkspace:
"""workspace field anchors relative input_dirs and output_dir."""

def test_relative_input_dirs_resolve_against_workspace(self, tmp_path):
_write_log(tmp_path / "logs", "run.log", "INFO: ok\n")
_write_log(tmp_path / "logs", "run.log", "INFO: ok\n[==tt-log-finish-line==] exit_code=0\n")
config = json.dumps(
{
"model": "test",
Expand All @@ -228,7 +228,7 @@ def test_relative_input_dirs_resolve_against_workspace(self, tmp_path):
assert any(tmp_path.glob("ai_job_summary*.md"))

def test_relative_output_dir_resolves_against_workspace(self, tmp_path):
_write_log(tmp_path / "logs", "run.log", "INFO: ok\n")
_write_log(tmp_path / "logs", "run.log", "INFO: ok\n[==tt-log-finish-line==] exit_code=0\n")
config = json.dumps(
{
"model": "test",
Expand All @@ -245,7 +245,7 @@ def test_env_vars_in_paths_are_expanded(self, tmp_path, monkeypatch):
# This is the only way to express the in-container workspace path
# for container jobs (the ${{ github.workspace }} expression
# always resolves to the host path).
_write_log(tmp_path / "logs", "run.log", "INFO: ok\n")
_write_log(tmp_path / "logs", "run.log", "INFO: ok\n[==tt-log-finish-line==] exit_code=0\n")
monkeypatch.setenv("GITHUB_WORKSPACE", str(tmp_path))
config = json.dumps(
{
Expand All @@ -266,15 +266,15 @@ class TestDualOutput:
"""Tool always writes both .md and .json."""

def test_writes_both_files(self, tmp_path):
_write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n")
_write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n[==tt-log-finish-line==] exit_code=0\n")
_write_log(tmp_path / "docker_server", "server.log", "INFO: server ready\n")
config = _config_json(tmp_path, ["run_logs", "docker_server"])
_run_cli(["--config", config])
assert any(tmp_path.glob("*.md"))
assert any(tmp_path.glob("*.json"))

def test_json_has_job_block(self, tmp_path):
_write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n")
_write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n[==tt-log-finish-line==] exit_code=0\n")
_write_log(tmp_path / "docker_server", "server.log", "INFO: server ready\n")
config = _config_json(tmp_path, ["run_logs", "docker_server"])
_run_cli(["--config", config])
Expand All @@ -283,7 +283,7 @@ def test_json_has_job_block(self, tmp_path):
assert "status" in data["_job"]

def test_filename_uses_job_id_from_url(self, tmp_path):
_write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n")
_write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n[==tt-log-finish-line==] exit_code=0\n")
_write_log(tmp_path / "docker_server", "server.log", "INFO: server ready\n")
config = _config_json(tmp_path, ["run_logs", "docker_server"])
_run_cli(["--config", config, "--job-url", "https://github.com/org/repo/actions/runs/1/job/99999"])
Expand All @@ -292,7 +292,7 @@ def test_filename_uses_job_id_from_url(self, tmp_path):

def test_fallback_filename_without_job_url(self, tmp_path):
"""When no --job-url, files named ai_job_summary.md/.json (no ID suffix)."""
_write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n")
_write_log(tmp_path / "run_logs", "run.log", "INFO: all good\n[==tt-log-finish-line==] exit_code=0\n")
_write_log(tmp_path / "docker_server", "server.log", "INFO: server ready\n")
config = _config_json(tmp_path, ["run_logs", "docker_server"])
_run_cli(["--config", config])
Expand Down Expand Up @@ -517,6 +517,7 @@ def success_dir(self, tmp_path):
2026-03-15 09:00:01,000 - server.py:50 - INFO: Server started on port 8000
2026-03-15 09:01:30,500 - run_workflows.py:101 - INFO: workflow: benchmarks completed with return code: 0
2026-03-15 09:02:46,000 - run.py:550 - INFO: All workflows completed successfully
[==tt-log-finish-line==] exit_code=0
"""
),
)
Expand All @@ -542,6 +543,49 @@ def test_job_name_in_json(self, success_dir):
assert data["_job"]["name"] == "Llama-3.1-8B (n150)"


class TestTimeoutMarkerAbsent:
    """Completion-marker handling for logs that contain no errors of their own.

    A log that looks clean but lacks the completion marker must be reported as
    TIMEOUT (GitHub ``timeout-minutes`` kill) rather than SUCCESS, and every
    path exercised here must be resolved without requesting an LLM client.
    """

    @staticmethod
    def _summary_json_without_llm(tmp_path, input_dirs, **config_overrides):
        """Run the CLI over ``input_dirs`` and return the parsed JSON summary.

        Also asserts that the CLI never asked for an LLM client. Any
        ``config_overrides`` are forwarded unchanged to ``_config_json``.
        """
        cli_config = _config_json(tmp_path, input_dirs, **config_overrides)
        with patch("ai_job_summary.cli.get_llm_client") as llm_factory:
            _run_cli(["--config", cli_config])
            llm_factory.assert_not_called()
        return json.loads(_read_summary(tmp_path, ".json"))

    def test_clean_truncated_log_is_timeout_without_llm(self, tmp_path):
        # No marker and no errors: the step was cut off mid-run.
        _write_log(tmp_path / "logs", "run.log", "INFO: running test_a\nINFO: running test_b\n")
        job = self._summary_json_without_llm(tmp_path, ["logs"])["_job"]
        assert job["status"] == "TIMEOUT"
        assert job["log_complete"] is False

    def test_marker_present_clean_log_is_success(self, tmp_path):
        # Marker written as the final line: the wrapper ran to completion.
        _write_log(tmp_path / "logs", "run.log", "INFO: ok\n[==tt-log-finish-line==] exit_code=0\n")
        summary = self._summary_json_without_llm(tmp_path, ["logs"])
        assert summary["_job"]["status"] == "SUCCESS"

    def test_missing_dir_wins_over_absent_marker(self, tmp_path):
        # The one present log has no marker AND another input dir is missing:
        # expect INFRA partial_logs, not TIMEOUT.
        _write_log(tmp_path / "run_logs", "run.log", "INFO: some log content\n")
        summary = self._summary_json_without_llm(tmp_path, ["run_logs", "docker_server"])
        assert summary["_job"]["status"] == "INFRA FAILURE"
        assert summary["category"] == "infra:partial_logs"

    def test_marker_disabled_keeps_clean_log_success(self, tmp_path):
        # Caller opts out of marker detection by passing null for the pattern.
        _write_log(tmp_path / "logs", "run.log", "INFO: ok\n")
        summary = self._summary_json_without_llm(tmp_path, ["logs"], log_complete_marker=None)
        assert summary["_job"]["status"] == "SUCCESS"


# ── Crash: LLM overrides extraction (using real fixture + mock response) ──────


Expand Down
Loading
Loading