From f3453e2601b4a5bf6237d9f53e3b216cbcbef571 Mon Sep 17 00:00:00 2001
From: mpatilgit-hub9
Date: Wed, 15 Apr 2026 15:34:13 +0530
Subject: [PATCH 1/2] Adding monitoring Scripts

Signed-off-by: mpatilgit-hub9
---
Review note: the workflow contents below were revised after this patch was
generated, so the blob ids on the "index" lines are stale.

 .github/workflows/blank.yml    |  67 +++++++++++++++++++++
 .github/workflows/watchdog.yml | 109 ++++++++++++++++++++++++++++++++++
 2 files changed, 176 insertions(+)
 create mode 100644 .github/workflows/blank.yml
 create mode 100644 .github/workflows/watchdog.yml

diff --git a/.github/workflows/blank.yml b/.github/workflows/blank.yml
new file mode 100644
index 0000000..6d4e22c
--- /dev/null
+++ b/.github/workflows/blank.yml
@@ -0,0 +1,67 @@
+name: CI
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+  workflow_dispatch:
+  schedule:
+    - cron: '*/30 * * * *'  # Runs in every 30 minutes
+
+jobs:
+  build:
+    strategy:
+      matrix:
+        runner: ["ubuntu-24.04-ppc64le", "ubuntu-24.04-ppc64le-p10"]
+      fail-fast: false
+    runs-on: ${{ matrix.runner }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Run a one-line script
+        run: echo "Hello, world! GitHub app is running successfully on ${{ matrix.runner }}"
+
+  code-quality:
+    name: Code quality
+    strategy:
+      matrix:
+        runner: ["ubuntu-24.04-ppc64le", "ubuntu-24.04-ppc64le-p10"]
+      fail-fast: false
+    runs-on: ${{ matrix.runner }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Run linting
+        run: |
+          if [ -x run_linting.sh ]; then
+            bash run_linting.sh
+          else
+            echo "Skipping linting: run_linting.sh not found or not executable"
+          fi
+
+
+
+  notify:
+    name: Notify failed build
+    needs:
+      - code-quality
+      - build
+    if: failure() && github.event_name != 'pull_request'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Send Slack Alert
+        env:
+          # Passed via env so the secret is never spliced into the script text.
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
+        run: |
+          # Use the runner's default GITHUB_* variables rather than workflow
+          # expression interpolation, so a branch name cannot inject shell syntax.
+          run_url="https://github.com/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}"
+          msg=$(printf '❌ CI Failed\nRepo: %s\nBranch: %s\nRun: %s' \
+            "$GITHUB_REPOSITORY" "$GITHUB_REF_NAME" "$run_url")
+
+          # jq builds the JSON so quotes/backslashes in values are escaped.
+          payload=$(jq -n --arg text "$msg" '{text: $text}')
+
+          curl -fsS -X POST -H 'Content-type: application/json' \
+            --data "$payload" \
+            "$SLACK_WEBHOOK_URL"
diff --git a/.github/workflows/watchdog.yml b/.github/workflows/watchdog.yml
new file mode 100644
index 0000000..11b0810
--- /dev/null
+++ b/.github/workflows/watchdog.yml
@@ -0,0 +1,109 @@
+name: Watchdog - Monitor gha-prod-workflow Status
+
+on:
+  schedule:
+    - cron: "*/5 * * * *"
+  workflow_dispatch:
+
+permissions:
+  actions: read
+  contents: read
+
+jobs:
+  monitor-blank:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y gh jq
+
+      - name: Check queued runs of blank.yml
+        id: check
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          echo "Checking queued runs..."
+
+          # Ask the API for queued runs only, with the largest page size, so
+          # old queued runs are not hidden behind the default 30-run page.
+          runs=$(gh api \
+            "repos/${GITHUB_REPOSITORY}/actions/workflows/blank.yml/runs?status=queued&per_page=100" \
+            --jq '.workflow_runs[] | select(.status=="queued")')
+
+          echo "stale_found=false" >> "$GITHUB_ENV"
+
+          if [ -z "$runs" ]; then
+            echo "No queued runs found."
+            exit 0
+          fi
+
+          # Age thresholds as epoch seconds: older than 30/60/90 minutes.
+          cutoff_yellow=$(date -u -d "30 minutes ago" +%s)
+          cutoff_red=$(date -u -d "60 minutes ago" +%s)
+          cutoff_severe=$(date -u -d "90 minutes ago" +%s)
+
+          # One compact JSON object per line so the loop can read run by run.
+          echo "$runs" | jq -c '.' > runs.txt
+          > stale_runs.txt
+
+          while read -r run; do
+            id=$(echo "$run" | jq -r '.id')
+            num=$(echo "$run" | jq -r '.run_number')
+            created=$(echo "$run" | jq -r '.created_at')
+            created_ts=$(date -d "$created" +%s)
+
+            # Check the oldest threshold first so the worst severity wins.
+            if [ "$created_ts" -lt "$cutoff_severe" ]; then
+              severity="severe"
+            elif [ "$created_ts" -lt "$cutoff_red" ]; then
+              severity="red"
+            elif [ "$created_ts" -lt "$cutoff_yellow" ]; then
+              severity="yellow"
+            else
+              continue
+            fi
+
+            echo "$id,$num,$created,$severity" >> stale_runs.txt
+
+          done < runs.txt
+
+          if [ -s stale_runs.txt ]; then
+            echo "stale_found=true" >> "$GITHUB_ENV"
+          fi
+
+      - name: Send Slack Alert if stale jobs found
+        if: env.stale_found == 'true'
+        env:
+          # Passed via env so the secret is never spliced into the script text.
+          WEBHOOK: ${{ secrets.SLACK_WEBHOOK_URL }}
+        run: |
+          while IFS=',' read -r id num created severity; do
+            [ -z "$id" ] && continue
+
+            run_url="https://github.com/${GITHUB_REPOSITORY}/actions/runs/$id"
+
+            if [ "$severity" = "severe" ]; then
+              headline="🔥 Severe Alert: Job stuck >90 min"
+            elif [ "$severity" = "red" ]; then
+              headline="🚨 Red Alert: Job stuck >60 min"
+            else
+              headline="⚠️ Yellow Alert: Job stuck >30 min"
+            fi
+
+            # printf emits real newlines; a literal "\n" inside the string would
+            # be escaped by jq --arg and shown verbatim in Slack.
+            msg=$(printf '%s\nRun: %s\nCreated: %s\n%s' \
+              "$headline" "$num" "$created" "$run_url")
+            payload=$(jq -n --arg text "$msg" '{text: $text}')
+
+            # Keep going if one post fails so the remaining alerts still go out.
+            curl -fsS -X POST -H 'Content-type: application/json' \
+              --data "$payload" \
+              "$WEBHOOK" \
+              || echo "::warning::Slack alert failed for run $id"
+
+            sleep 1
+
+          done < stale_runs.txt
From 42d480086dd9ca5ed9bce075b8de0104f0b70f78 Mon Sep 17 00:00:00 2001
From: mpatilgit-hub9
Date: Wed, 15 Apr 2026 19:14:35 +0530
Subject: [PATCH 2/2] Expand runner coverage for all worker types

Signed-off-by: mpatilgit-hub9
---
Review note: patch 1/2 was revised after generation, so the pre-image blob id
on the "index" line below is stale; the hunks themselves are unchanged.

 .github/workflows/blank.yml | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/blank.yml b/.github/workflows/blank.yml
index 6d4e22c..f353e51 100644
--- a/.github/workflows/blank.yml
+++ b/.github/workflows/blank.yml
@@ -7,13 +7,21 @@ on:
     branches: [main]
   workflow_dispatch:
   schedule:
-    - cron: '*/30 * * * *'  # Runs in every 30 minutes
+    - cron: '*/30 * * * *'
 
 jobs:
   build:
     strategy:
       matrix:
-        runner: ["ubuntu-24.04-ppc64le", "ubuntu-24.04-ppc64le-p10"]
+        runner: [
+          "ubuntu-24.04-ppc64le",
+          "ubuntu-24.04-ppc64le-p10",
+          "ubuntu-24.04-ppc64le-2xlarge",
+          "ubuntu-24.04-ppc64le-2xlarge-p10",
+          "ubuntu-24.04-ppc64le-4xlarge",
+          "ubuntu-24.04-ppc64le-4xlarge-p10",
+          "ubuntu-24.04-s390x"
+        ]
       fail-fast: false
     runs-on: ${{ matrix.runner }}
     steps:
@@ -25,7 +33,15 @@ jobs:
     name: Code quality
     strategy:
       matrix:
-        runner: ["ubuntu-24.04-ppc64le", "ubuntu-24.04-ppc64le-p10"]
+        runner: [
+          "ubuntu-24.04-ppc64le",
+          "ubuntu-24.04-ppc64le-p10",
+          "ubuntu-24.04-ppc64le-2xlarge",
+          "ubuntu-24.04-ppc64le-2xlarge-p10",
+          "ubuntu-24.04-ppc64le-4xlarge",
+          "ubuntu-24.04-ppc64le-4xlarge-p10",
+          "ubuntu-24.04-s390x"
+        ]
       fail-fast: false
     runs-on: ${{ matrix.runner }}
     steps:
@@ -38,8 +54,6 @@ jobs:
             echo "Skipping linting: run_linting.sh not found or not executable"
           fi
 
-
-
   notify:
     name: Notify failed build
     needs: