From f6bbe8712b7db4bfe959a707ab4f63d62e81eba8 Mon Sep 17 00:00:00 2001 From: Roman Lemekha Date: Thu, 25 Jun 2026 18:38:57 +0200 Subject: [PATCH] feat: add gitleaks + privacy-scan reusables and pre-commit hook Centralize the secret/privacy scanning so all repos share one source of truth instead of agent-driven manual review: - gitleaks-reusable.yml: secret scan over full history (workflow_call), lifted from docker_infra's local gitleaks.yml. - privacy-scan-reusable.yml + scripts/privacy-scan.sh: fail if tracked files contain private-infrastructure markers (private IPs, .local hosts, host paths, key/secret markers). Built-in patterns are GENERIC (safe for this public repo); callers pass their own private hostnames via the extra-patterns input / EXTRA_PATTERNS env, so secret-ish values stay in the private caller. The reusable checks out the script from this repo at job.workflow_sha so the rules cannot be tampered with by the calling repo's tree. - .pre-commit-hooks.yaml: exposes the same script as a pre-commit hook, so the local hook and CI enforce identical rules. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/gitleaks-reusable.yml | 27 ++++++ .github/workflows/privacy-scan-reusable.yml | 59 +++++++++++++ .pre-commit-hooks.yaml | 22 +++++ scripts/privacy-scan.sh | 91 +++++++++++++++++++++ 4 files changed, 199 insertions(+) create mode 100644 .github/workflows/gitleaks-reusable.yml create mode 100644 .github/workflows/privacy-scan-reusable.yml create mode 100644 .pre-commit-hooks.yaml create mode 100755 scripts/privacy-scan.sh diff --git a/.github/workflows/gitleaks-reusable.yml b/.github/workflows/gitleaks-reusable.yml new file mode 100644 index 0000000..a372948 --- /dev/null +++ b/.github/workflows/gitleaks-reusable.yml @@ -0,0 +1,27 @@ +# Reusable gitleaks secret scan. 
Callers delegate here: +# +# jobs: +# gitleaks: +# uses: roleme/workflows/.github/workflows/gitleaks-reusable.yml@<sha> +# secrets: inherit +# +# fetch-depth: 0 lets gitleaks scan full history, not just the tip. +name: gitleaks (reusable) + +on: + workflow_call: {} + +permissions: + contents: read + +jobs: + scan: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + fetch-depth: 0 + persist-credentials: false + - uses: gitleaks/gitleaks-action@e0c47f4f8be36e29cdc102c57e68cb5cbf0e8d1e # v3 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/privacy-scan-reusable.yml b/.github/workflows/privacy-scan-reusable.yml new file mode 100644 index 0000000..ea16db2 --- /dev/null +++ b/.github/workflows/privacy-scan-reusable.yml @@ -0,0 +1,59 @@ +# Reusable privacy scan: fails if tracked files contain private-infrastructure +# markers (private IPs, .local hosts, host paths, key/secret markers, plus any +# caller-supplied hostnames). Runs scripts/privacy-scan.sh from this repo. +# +# Callers delegate here and pass their own private hostnames via extra-patterns +# (newline-separated extended-regexps). Those patterns stay in the caller's +# (private) repo and run in the caller's (private) Actions context — they are +# NOT hardcoded in this public repo: +# +# jobs: +# privacy-scan: +# uses: roleme/workflows/.github/workflows/privacy-scan-reusable.yml@<sha> +# with: +# extra-patterns: | +# example-private-host\.example +name: privacy-scan (reusable) + +on: + workflow_call: + inputs: + extra-patterns: + description: Newline-separated extended-regexps of extra strings to flag (e.g. private hostnames) + required: false + type: string + default: "" + +permissions: + contents: read + +jobs: + privacy-scan: + runs-on: ubuntu-latest + steps: + # The caller's own code is checked out (default), but the scan SCRIPT must + # come from this workflows repo.
Fetch it explicitly so the rules cannot + # be tampered with by the calling repo's working tree. + - name: Checkout caller repo + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + persist-credentials: false + + - name: Fetch scan script from workflows repo + # job.workflow_repository + job.workflow_sha resolve to THIS reusable + # workflow's repo at the exact commit the caller pinned — so the scan + # rules cannot be swapped by the calling repo's working tree, and a + # caller pinned to an older SHA gets that SHA's script (not main). + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + repository: ${{ job.workflow_repository }} + ref: ${{ job.workflow_sha }} + path: .privacy-scan-tools + persist-credentials: false + + - name: Run privacy scan + # extra-patterns is a workflow input (trusted caller config). Pass it via + # env and quote it so it is never spliced into the command line. + env: + EXTRA_PATTERNS: ${{ inputs.extra-patterns }} + run: bash .privacy-scan-tools/scripts/privacy-scan.sh diff --git a/.pre-commit-hooks.yaml b/.pre-commit-hooks.yaml new file mode 100644 index 0000000..0c62be7 --- /dev/null +++ b/.pre-commit-hooks.yaml @@ -0,0 +1,22 @@ +# Lets repos use this script as a local pre-commit hook via the pre-commit +# framework. In a consuming repo's .pre-commit-config.yaml: +# +# repos: +# - repo: https://github.com/roleme/workflows +# rev: <sha> +# hooks: +# - id: privacy-scan +# # private hostnames stay in the consuming repo's config, not here: +# args: ["--"] +# # pass extra patterns via env in the hook invocation, e.g.: +# # EXTRA_PATTERNS=$'domovas\\.uk' git commit ... +# +# The scan reads EXTRA_PATTERNS from the environment (same as CI), so private +# hostnames are supplied by the developer/repo, never stored in this public repo. +- id: privacy-scan + name: privacy scan (private-infra markers) + description: Fail the commit if tracked files contain private-infrastructure markers.
+  entry: scripts/privacy-scan.sh
+  language: script
+  pass_filenames: false
+  always_run: true
diff --git a/scripts/privacy-scan.sh b/scripts/privacy-scan.sh
new file mode 100755
index 0000000..740ab9d
--- /dev/null
+++ b/scripts/privacy-scan.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+# privacy-scan.sh — fail if tracked files contain private infrastructure
+# markers. Shared by the reusable CI workflow and the local pre-commit hook so
+# both enforce the exact same rules.
+#
+# Scans all tracked files (git ls-files). Built-in patterns are GENERIC and
+# safe to live in this PUBLIC repo (private IP ranges, .local hosts, common
+# host paths, key/secret markers). Caller-specific private hostnames are NOT
+# hardcoded here — a private caller passes them via $EXTRA_PATTERNS
+# (newline-separated extended-regexps), so the secret-ish list stays in the
+# private repo that runs the scan, never in this public one.
+#
+# Usage:
+#   EXTRA_PATTERNS=$'domovas\\.uk\nmininas\\.local' scripts/privacy-scan.sh
+#
+# Env:
+#   EXTRA_PATTERNS  newline-separated ERE patterns to also flag (optional)
+#   ALLOW_FILE      path to a file listing path globs to skip (optional)
+#
+# Exit status:
+#   0  clean — nothing matched
+#   1  at least one match (offending file:line is printed)
+#   2  the combined pattern is not a valid ERE, so nothing was scanned
+set -euo pipefail
+
+# Generic markers — no private values, safe for a public repo.
+builtin_patterns=(
+  '\b10\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\b'               # 10.0.0.0/8
+  '\b192\.168\.[0-9]{1,3}\.[0-9]{1,3}\b'                     # 192.168.0.0/16
+  '\b172\.(1[6-9]|2[0-9]|3[01])\.[0-9]{1,3}\.[0-9]{1,3}\b'   # 172.16.0.0/12
+  '\b[a-z0-9-]+\.local\b'                                    # mDNS/.local hosts
+  '/home/[a-z0-9_-]+/'                                       # host home paths
+  '/volume[0-9]+/'                                           # Synology volume paths
+  '/etc/komodo/'                                             # komodo host config
+  'ghp_[A-Za-z0-9]{30,}'                                     # GitHub PAT
+  'github_pat_[A-Za-z0-9_]{30,}'                             # fine-grained PAT
+  'xox[baprs]-[A-Za-z0-9-]{10,}'                             # Slack token
+  '-----BEGIN[A-Z ]*PRIVATE KEY-----'                        # private keys
+  'cli_secret'                                               # komodo cli secret key
+)
+
+# Caller-supplied extra patterns (one ERE per line, blank lines dropped) are
+# appended to the built-ins. A plain read loop is used rather than `mapfile`,
+# which only exists in bash >= 4.
+patterns=("${builtin_patterns[@]}")
+while IFS= read -r line; do
+  [[ "$line" =~ ^[[:space:]]*$ ]] || patterns+=("$line")
+done <<< "${EXTRA_PATTERNS:-}"
+
+# This script defines the patterns, so it would match itself — exclude it, plus
+# the lockfile-ish / binary stuff that produces noise. Callers can extend via
+# ALLOW_FILE (one path-glob per line; blank lines and '#' comments are skipped).
+exclude_globs=(
+  '*privacy-scan.sh'
+  '*.png' '*.jpg' '*.jpeg' '*.gif' '*.ico' '*.pdf' '*.lock'
+)
+if [[ -n "${ALLOW_FILE:-}" && -f "${ALLOW_FILE}" ]]; then
+  # `|| [[ -n "$line" ]]` keeps a final glob that has no trailing newline.
+  while IFS= read -r line || [[ -n "$line" ]]; do
+    [[ -z "$line" || "$line" == \#* ]] && continue
+    exclude_globs+=("$line")
+  done < "${ALLOW_FILE}"
+fi
+
+# is_excluded PATH — return 0 if PATH matches any glob in exclude_globs, else 1.
+is_excluded() {
+  local f="$1" g
+  for g in "${exclude_globs[@]}"; do
+    # $g is deliberately unquoted so it is matched as a glob, not a literal.
+    # shellcheck disable=SC2053
+    [[ "$f" == $g ]] && return 0
+  done
+  return 1
+}
+
+# Build a single alternation for one grep pass per file.
+joined=$(printf '%s|' "${patterns[@]}")
+joined="${joined%|}"
+
+# Validate the alternation once, against empty input. grep exits 2 on a
+# malformed ERE; inside the per-file `if` below that would look like "no match"
+# for every file and the scan would report clean without scanning anything.
+regex_rc=0
+grep -qE -e "$joined" </dev/null || regex_rc=$?
+if [[ "$regex_rc" -gt 1 ]]; then
+  echo "privacy-scan: invalid regular expression — check EXTRA_PATTERNS." >&2
+  exit 2
+fi
+
+found=0
+# -z makes git emit raw NUL-terminated paths; without it, paths with unusual
+# bytes are printed C-quoted and grep cannot open them.
+while IFS= read -r -d '' f; do
+  is_excluded "$f" && continue
+  # -I skips binary files; -n gives line numbers; -E extended regex; -e keeps
+  # the pattern from ever being parsed as an option.
+  if matches=$(grep -InE -e "$joined" -- "$f" 2>/dev/null); then
+    found=1
+    echo "::error file=$f::private-infra marker found" 2>/dev/null || true
+    while IFS= read -r m; do
+      echo "  $f:$m"
+    done <<< "$matches"
+  fi
+done < <(git ls-files -z)
+
+if [[ "$found" -ne 0 ]]; then
+  echo ""
+  echo "privacy-scan: found private-infrastructure markers above." >&2
+  echo "If a match is a false positive, exclude its path via ALLOW_FILE." >&2
+  exit 1
+fi
+
+echo "privacy-scan: clean — no private-infrastructure markers found."