Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 53 additions & 3 deletions validate-cascade.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,18 @@
r"repos/crunchtools/([A-Za-z0-9._-]+)/dispatches",
)

# Workflow-YAML patterns that indicate a crunchtools image is BUILT (and pushed
# to Quay) by some workflow, even when no GitHub repo carries the image's name.
# Examples: acquacotta builds quay.io/crunchtools/acquacotta-base from
# Containerfile.base inside the acquacotta repo via container-base.yml, and
# rotv does the same with rotv-base. A hit on either pattern is sufficient
# evidence; capture group 1 is always the bare image name.

# Env-style declaration at the start of a line, e.g. `IMAGE_NAME: crunchtools/foo`
# (any upper-case key containing IMAGE).
_ENV_IMAGE_DECL_RE = re.compile(
    r"^\s*[A-Z_]*IMAGE[A-Z_]*:\s*crunchtools/([A-Za-z0-9._-]+)",
    re.MULTILINE,
)

# Literal registry reference in tags / image fields, e.g.
# `quay.io/crunchtools/foo[:tag]`. The name must be followed by one of the
# listed delimiter characters or by the end of the text.
_QUAY_LITERAL_REF_RE = re.compile(
    r"quay\.io/crunchtools/([A-Za-z0-9._-]+?)(?:[:@\s\"',}]|$)",
)

PUBLISHED_IMAGE_RES = [_ENV_IMAGE_DECL_RE, _QUAY_LITERAL_REF_RE]


def gh(path: str, token: str) -> dict | list:
req = urllib.request.Request(
Expand Down Expand Up @@ -115,9 +127,45 @@ def main() -> int:
repo_set = set(repos)
print(f" {len(repos)} non-archived repos", file=sys.stderr)

# Discover every crunchtools image that is BUILT by some workflow in some
# repo (typically pushed to quay.io/crunchtools/<name>). This catches the
# "same repo publishes both an app and its base" pattern (acquacotta builds
# acquacotta-base via container-base.yml; rotv builds rotv-base via
# build-base.yml). A FROM pointing at such an image is NOT broken even
# though no separate repo of that name exists.
print("Scanning workflows for published images...", file=sys.stderr)
published_images: set[str] = set()
# also cache workflow files so we don't re-fetch in the dispatch pass
workflows_cache: dict[tuple[str, str], str] = {}
for r in repos:
try:
wf_entries = gh(f"/repos/{args.org}/{r}/contents/.github/workflows", token)
except urllib.error.HTTPError as e:
if e.code == 404:
continue
raise
if not isinstance(wf_entries, list):
continue
for entry in wf_entries:
name = entry.get("name", "")
if not name.endswith((".yml", ".yaml")):
continue
Comment on lines +149 to +152

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

When scanning the `.github/workflows` directory, some entries might not be regular files (e.g., subdirectories, symlinks, or submodules). Calling `fetch_text` on a non-file entry can raise a `TypeError` or `KeyError`, because the GitHub API response for a directory listing does not contain a `content` field.

To prevent potential crashes, explicitly filter out non-file entries by checking entry.get("type") == "file".

Suggested change
for entry in wf_entries:
name = entry.get("name", "")
if not name.endswith((".yml", ".yaml")):
continue
for entry in wf_entries:
if entry.get("type") != "file":
continue
name = entry.get("name", "")
if not name.endswith((".yml", ".yaml")):
continue

txt = fetch_text(args.org, r, f".github/workflows/{name}", token)
if txt is None:
continue
workflows_cache[(r, name)] = txt
for rx in PUBLISHED_IMAGE_RES:
for m in rx.finditer(txt):
published_images.add(m.group(1))
# A repo also "exists" as a publishable image if it has a Containerfile,
# since the standard pattern pushes quay.io/crunchtools/<reponame>.
known_images = repo_set | published_images
print(f" {len(published_images)} crunchtools image names found in workflows", file=sys.stderr)

# FROM-graph: parent_image -> {child_repo, ...}
from_graph: dict[str, set[str]] = defaultdict(set)
# Track unresolved FROM targets (broken edges)
# Track unresolved FROM targets (broken edges) — a FROM target is broken
# only if no repo AND no workflow-published image of that name exists.
broken_froms: list[tuple[str, str]] = [] # (child_repo, missing_parent)

for r in repos:
Expand All @@ -129,14 +177,16 @@ def main() -> int:
for m in FROM_RE.finditer(cf):
parent = m.group(1).split(":")[0] # strip :tag if any
from_graph[parent].add(r)
if parent not in repo_set:
if parent not in known_images:
broken_froms.append((r, parent))

# dispatch-graph: parent_repo -> {dispatched_child, ...}
dispatch_graph: dict[str, set[str]] = defaultdict(set)
for r in repos:
for wf in ("build.yml", "container.yml"):
txt = fetch_text(args.org, r, f".github/workflows/{wf}", token)
txt = workflows_cache.get((r, wf))
if txt is None:
txt = fetch_text(args.org, r, f".github/workflows/{wf}", token)
if txt is None:
Comment on lines +187 to 190

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Since `workflows_cache` is populated by scanning the entire `.github/workflows` directory of every repository, every existing `.yml`/`.yaml` workflow file whose contents could be fetched is already cached. If a workflow file (like `build.yml` or `container.yml`) is not present in `workflows_cache`, it means the file does not exist in the repository (or could not be fetched during the scan).

Calling fetch_text here is redundant and will always result in a useless 404 API request. Since most repositories only have one of these two workflow files, this causes an unnecessary 404 request for almost every repository, which can easily trigger GitHub API rate limits and slow down execution.

We can safely remove the fallback to `fetch_text` and, when the `.yml` lookup misses, also try the same workflow name with the `.yaml` extension.

Suggested change
txt = workflows_cache.get((r, wf))
if txt is None:
txt = fetch_text(args.org, r, f".github/workflows/{wf}", token)
if txt is None:
txt = workflows_cache.get((r, wf))
if txt is None and wf.endswith(".yml"):
txt = workflows_cache.get((r, wf[:-4] + ".yaml"))
if txt is None:
continue

continue
for m in DISPATCH_LOOP_RE.finditer(txt):
Expand Down
Loading