From f3453e2601b4a5bf6237d9f53e3b216cbcbef571 Mon Sep 17 00:00:00 2001
From: mpatilgit-hub9
Date: Wed, 15 Apr 2026 15:34:13 +0530
Subject: [PATCH 1/2] Adding monitoring Scripts

Signed-off-by: mpatilgit-hub9
---
 .github/workflows/blank.yml    | 57 ++++++++++++++++++++
 .github/workflows/watchdog.yml | 96 ++++++++++++++++++++++++++++++++++
 2 files changed, 153 insertions(+)
 create mode 100644 .github/workflows/blank.yml
 create mode 100644 .github/workflows/watchdog.yml

diff --git a/.github/workflows/blank.yml b/.github/workflows/blank.yml
new file mode 100644
index 0000000..6d4e22c
--- /dev/null
+++ b/.github/workflows/blank.yml
@@ -0,0 +1,57 @@
+name: CI
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+  workflow_dispatch:
+  schedule:
+    - cron: '*/30 * * * *'  # Runs in every 30 minutes
+
+jobs:
+  build:
+    strategy:
+      matrix:
+        runner: ["ubuntu-24.04-ppc64le", "ubuntu-24.04-ppc64le-p10"]
+      fail-fast: false
+    runs-on: ${{ matrix.runner }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Run a one-line script
+        run: echo "Hello, world! GitHub app is running successfully on ${{ matrix.runner }}"
+
+  code-quality:
+    name: Code quality
+    strategy:
+      matrix:
+        runner: ["ubuntu-24.04-ppc64le", "ubuntu-24.04-ppc64le-p10"]
+      fail-fast: false
+    runs-on: ${{ matrix.runner }}
+    steps:
+      - uses: actions/checkout@v4
+      - name: Run linting
+        run: |
+          if [ -x run_linting.sh ]; then
+            bash run_linting.sh
+          else
+            echo "Skipping linting: run_linting.sh not found or not executable"
+          fi
+
+
+
+  notify:
+    name: Notify failed build
+    needs:
+      - code-quality
+      - build
+    if: failure() && github.event_name != 'pull_request'
+    runs-on: ubuntu-latest
+    steps:
+      - name: Send Slack Alert
+        run: |
+          msg="❌ CI Failed\nRepo: ${{ github.repository }}\nBranch: ${{ github.ref_name }}\nRun: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+
+          curl -X POST -H 'Content-type: application/json' \
+            --data "{\"text\":\"$msg\"}" \
+            ${{ secrets.SLACK_WEBHOOK_URL }}
diff --git a/.github/workflows/watchdog.yml b/.github/workflows/watchdog.yml
new file mode 100644
index 0000000..11b0810
--- /dev/null
+++ b/.github/workflows/watchdog.yml
@@ -0,0 +1,96 @@
+name: Watchdog - Monitor gha-prod-workflow Status
+
+on:
+  schedule:
+    - cron: "*/5 * * * *"
+  workflow_dispatch:
+
+permissions:
+  actions: read
+  contents: read
+
+jobs:
+  monitor-blank:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt update
+          sudo apt install -y gh jq
+
+      - name: Check queued runs of blank.yml
+        id: check
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          echo "Checking queued runs..."
+
+          runs=$(gh api repos/${{ github.repository }}/actions/workflows/blank.yml/runs \
+            --jq '.workflow_runs[] | select(.status=="queued")')
+
+          echo "stale_found=false" >> $GITHUB_ENV
+
+          if [ -z "$runs" ]; then
+            echo "No queued runs found."
+            exit 0
+          fi
+
+          cutoff_yellow=$(date -u -d "30 minutes ago" +%s)
+          cutoff_red=$(date -u -d "60 minutes ago" +%s)
+          cutoff_severe=$(date -u -d "90 minutes ago" +%s)
+
+          echo "$runs" | jq -c '.' > runs.txt
+          > stale_runs.txt
+
+          while read -r run; do
+            id=$(echo "$run" | jq -r '.id')
+            num=$(echo "$run" | jq -r '.run_number')
+            created=$(echo "$run" | jq -r '.created_at')
+            created_ts=$(date -d "$created" +%s)
+
+            if [ "$created_ts" -lt "$cutoff_severe" ]; then
+              severity="severe"
+            elif [ "$created_ts" -lt "$cutoff_red" ]; then
+              severity="red"
+            elif [ "$created_ts" -lt "$cutoff_yellow" ]; then
+              severity="yellow"
+            else
+              continue
+            fi
+
+            echo "$id,$num,$created,$severity" >> stale_runs.txt
+
+          done < runs.txt
+
+          if [ -s stale_runs.txt ]; then
+            echo "stale_found=true" >> $GITHUB_ENV
+          fi
+
+      - name: Send Slack Alert if stale jobs found
+        if: env.stale_found == 'true'
+        run: |
+          WEBHOOK="${{ secrets.SLACK_WEBHOOK_URL }}"
+
+          while IFS=',' read -r id num created severity; do
+            [ -z "$id" ] && continue
+
+            run_url="https://github.com/${{ github.repository }}/actions/runs/$id"
+
+            if [ "$severity" = "severe" ]; then
+              msg="🔥 Severe Alert: Job stuck >90 min \nRun: $num\nCreated: $created\n$run_url"
+            elif [ "$severity" = "red" ]; then
+              msg="🚨 Red Alert: Job stuck >60 min \nRun: $num\nCreated: $created\n$run_url"
+            else
+              msg="⚠️ Yellow Alert: Job stuck >30 min \nRun: $num\nCreated: $created\n$run_url"
+            fi
+
+            payload=$(jq -n --arg text "$msg" '{text: $text}')
+
+            curl -X POST -H 'Content-type: application/json' \
+              --data "$payload" \
+              "$WEBHOOK"
+
+            sleep 1
+
+          done < stale_runs.txt
From fc609f39b78f61efef5d6cd1b9b574e09b1007de Mon Sep 17 00:00:00 2001
From: mpatilgit-hub9
Date: Tue, 28 Apr 2026 21:12:02 +0530
Subject: [PATCH 2/2] Split the workflows for default and large type and also
 added health check logic (CPU, I/O, Memory, Process)

Signed-off-by: mpatilgit-hub9
---
Review notes (this section is ignored by git-am):
- watchdog.yml: the Slack call was `curl -s POST ...`; without -X, curl treats
  the bare word "POST" as an additional URL. It is now `curl -sS -X POST ...`.
- watchdog.yml: watchdog_state.json is only written to the runner workspace.
  No step commits, caches or uploads it, so on a fresh runner the "Prev" value
  is always "none"; FAILED alerts repeat on every run and RECOVERED alerts
  never fire. Persisting the file is left as a follow-up.
- Emoji that had been mis-decoded (mojibake) in alert strings are restored.

 .github/workflows/Default-Runners-Fleet.yml |  58 ++++++
 .github/workflows/Large-Workers-Fleet.yml   |  53 ++++++
 .github/workflows/blank.yml                 |  57 ------
 .github/workflows/watchdog.yml              | 190 +++++++++++++-------
 4 files changed, 233 insertions(+), 125 deletions(-)
 create mode 100644 .github/workflows/Default-Runners-Fleet.yml
 create mode 100644 .github/workflows/Large-Workers-Fleet.yml
 delete mode 100644 .github/workflows/blank.yml

diff --git a/.github/workflows/Default-Runners-Fleet.yml b/.github/workflows/Default-Runners-Fleet.yml
new file mode 100644
index 0000000..5e8f4d5
--- /dev/null
+++ b/.github/workflows/Default-Runners-Fleet.yml
@@ -0,0 +1,58 @@
+name: Default-Runners-Fleet
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+  workflow_dispatch:
+  schedule:
+    - cron: '*/30 * * * *'
+
+jobs:
+  build:
+    strategy:
+      matrix:
+        runner: [
+          "ubuntu-24.04-ppc64le",
+          "ubuntu-24.04-ppc64le-p10",
+          "ubuntu-24.04-s390x"
+        ]
+      fail-fast: false
+
+    runs-on: ${{ matrix.runner }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Runner info
+        run: |
+          echo "Runner: ${{ matrix.runner }}"
+          uname -a
+          uptime
+
+      # CPU test
+      - name: CPU test
+        run: timeout 20s bash -c "yes > /dev/null" || true
+
+      # IO test
+      - name: IO test
+        run: |
+          set -e
+          dd if=/dev/zero of=testfile bs=1M count=256 oflag=dsync
+          rm -f testfile
+
+      # Network test
+      - name: Network test
+        run: |
+          set -e
+          ping -c 3 github.com
+          curl -I https://github.com
+
+      # Memory
+      - name: Memory check
+        run: free -h
+
+      # Process check
+      - name: Process snapshot
+        run: ps aux --sort=-%mem | head -10
diff --git a/.github/workflows/Large-Workers-Fleet.yml b/.github/workflows/Large-Workers-Fleet.yml
new file mode 100644
index 0000000..5e3fdb2
--- /dev/null
+++ b/.github/workflows/Large-Workers-Fleet.yml
@@ -0,0 +1,53 @@
+name: Large-Workers-Fleet
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 */1 * * *'  # every 1 hour
+
+jobs:
+  health-check:
+    strategy:
+      matrix:
+        runner: [
+          "ubuntu-24.04-ppc64le-2xlarge",
+          "ubuntu-24.04-ppc64le-2xlarge-p10",
+          "ubuntu-24.04-ppc64le-4xlarge",
+          "ubuntu-24.04-ppc64le-4xlarge-p10"
+        ]
+      fail-fast: false
+
+    runs-on: ${{ matrix.runner }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Runner info
+        run: |
+          echo "Runner: ${{ matrix.runner }}"
+          uname -a
+          uptime
+
+      # CPU stress test
+      - name: CPU test
+        run: timeout 25s bash -c "yes > /dev/null" || true
+
+      # IO heavy test (larger)
+      - name: IO test
+        run: |
+          dd if=/dev/zero of=testfile bs=1M count=512 oflag=dsync
+          rm -f testfile
+
+      # Network test
+      - name: Network test
+        run: |
+          ping -c 4 github.com
+          curl -I https://github.com
+
+      # Memory check
+      - name: Memory check
+        run: free -h
+
+      # Process snapshot
+      - name: Process snapshot
+        run: ps aux --sort=-%mem | head -10
diff --git a/.github/workflows/blank.yml b/.github/workflows/blank.yml
deleted file mode 100644
index 6d4e22c..0000000
--- a/.github/workflows/blank.yml
+++ /dev/null
@@ -1,57 +0,0 @@
-name: CI
-
-on:
-  pull_request:
-    branches: [main]
-  push:
-    branches: [main]
-  workflow_dispatch:
-  schedule:
-    - cron: '*/30 * * * *'  # Runs in every 30 minutes
-
-jobs:
-  build:
-    strategy:
-      matrix:
-        runner: ["ubuntu-24.04-ppc64le", "ubuntu-24.04-ppc64le-p10"]
-      fail-fast: false
-    runs-on: ${{ matrix.runner }}
-    steps:
-      - uses: actions/checkout@v4
-      - name: Run a one-line script
-        run: echo "Hello, world! GitHub app is running successfully on ${{ matrix.runner }}"
-
-  code-quality:
-    name: Code quality
-    strategy:
-      matrix:
-        runner: ["ubuntu-24.04-ppc64le", "ubuntu-24.04-ppc64le-p10"]
-      fail-fast: false
-    runs-on: ${{ matrix.runner }}
-    steps:
-      - uses: actions/checkout@v4
-      - name: Run linting
-        run: |
-          if [ -x run_linting.sh ]; then
-            bash run_linting.sh
-          else
-            echo "Skipping linting: run_linting.sh not found or not executable"
-          fi
-
-
-
-  notify:
-    name: Notify failed build
-    needs:
-      - code-quality
-      - build
-    if: failure() && github.event_name != 'pull_request'
-    runs-on: ubuntu-latest
-    steps:
-      - name: Send Slack Alert
-        run: |
-          msg="❌ CI Failed\nRepo: ${{ github.repository }}\nBranch: ${{ github.ref_name }}\nRun: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}"
-
-          curl -X POST -H 'Content-type: application/json' \
-            --data "{\"text\":\"$msg\"}" \
-            ${{ secrets.SLACK_WEBHOOK_URL }}
diff --git a/.github/workflows/watchdog.yml b/.github/workflows/watchdog.yml
index 11b0810..04c7e36 100644
--- a/.github/workflows/watchdog.yml
+++ b/.github/workflows/watchdog.yml
@@ -1,96 +1,150 @@
-name: Watchdog - Monitor gha-prod-workflow Status
+name: Watchdog - Multi Workflow Health Monitor
 
 on:
   schedule:
-    - cron: "*/5 * * * *"
+    - cron: "*/5 * * * *"  # Runs every 5 minutes
   workflow_dispatch:
 
 permissions:
   actions: read
-  contents: read
+  contents: write
 
 jobs:
-  monitor-blank:
+  monitor:
     runs-on: ubuntu-latest
 
+    defaults:
+      run:
+        shell: bash
+
     steps:
       - name: Install dependencies
         run: |
           sudo apt update
           sudo apt install -y gh jq
 
-      - name: Check queued runs of blank.yml
-        id: check
+      - name: Checkout repo (for state persistence)
+        uses: actions/checkout@v4
+
+      - name: Monitor workflows
         env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
         run: |
-          echo "Checking queued runs..."
-
-          runs=$(gh api repos/${{ github.repository }}/actions/workflows/blank.yml/runs \
-            --jq '.workflow_runs[] | select(.status=="queued")')
-
-          echo "stale_found=false" >> $GITHUB_ENV
-
-          if [ -z "$runs" ]; then
-            echo "No queued runs found."
-            exit 0
+          set -e
+
+          owner_repo="${{ github.repository }}"
+          workflows=("Default-Runners-Fleet.yml" "Large-Workers-Fleet.yml")
+
+          state_file="watchdog_state.json"
+
+
+          if [ -f "$state_file" ]; then
+            state=$(cat "$state_file")
+          else
+            state='{}'
           fi
-
-          cutoff_yellow=$(date -u -d "30 minutes ago" +%s)
-          cutoff_red=$(date -u -d "60 minutes ago" +%s)
-          cutoff_severe=$(date -u -d "90 minutes ago" +%s)
-
-          echo "$runs" | jq -c '.' > runs.txt
-          > stale_runs.txt
-
-          while read -r run; do
-            id=$(echo "$run" | jq -r '.id')
-            num=$(echo "$run" | jq -r '.run_number')
-            created=$(echo "$run" | jq -r '.created_at')
-            created_ts=$(date -d "$created" +%s)
-
-            if [ "$created_ts" -lt "$cutoff_severe" ]; then
-              severity="severe"
-            elif [ "$created_ts" -lt "$cutoff_red" ]; then
-              severity="red"
-            elif [ "$created_ts" -lt "$cutoff_yellow" ]; then
-              severity="yellow"
-            else
+
+          new_state='{}'
+          alerts=""
+
+          for wf in "${workflows[@]}"; do
+            echo "Checking workflow: $wf"
+
+            runs=$(gh api repos/$owner_repo/actions/workflows/$wf/runs --jq '.workflow_runs')
+            latest=$(echo "$runs" | jq '.[0]')
+
+            if [ "$latest" = "null" ]; then
+              echo "No runs found for $wf"
               continue
             fi
+
+            status=$(echo "$latest" | jq -r '.status // "unknown"')
+            conclusion=$(echo "$latest" | jq -r '.conclusion // "unknown"')
+            created=$(echo "$latest" | jq -r '.created_at')
+            html_url=$(echo "$latest" | jq -r '.html_url')
+
+            prev=$(echo "$state" | jq -r --arg wf "$wf" '.[$wf] // "none"')
+
+            echo "Status: $status | Conclusion: $conclusion | Prev: $prev"
+
+            # Time calculation
+            now=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
+            created_epoch=$(date -d "$created" +%s 2>/dev/null || echo 0)
+            now_epoch=$(date -d "$now" +%s 2>/dev/null || echo 0)
+            queued_minutes=$(( (now_epoch - created_epoch) / 60 ))
+
+            # QUEUED LOGIC WITH SEVERITY
+            if [ "$status" = "queued" ]; then
+
+              if [ "$queued_minutes" -ge 90 ]; then
+                alerts="${alerts}
+          🚨 *SEVERE*: Workflow *$wf* stuck in queue for *${queued_minutes} mins*
+          🔗 View Run: ${html_url}"
+
+              elif [ "$queued_minutes" -ge 60 ]; then
+                alerts="${alerts}
+          🔴 *HIGH*: Workflow *$wf* queued for *${queued_minutes} mins*
+          🔗 View Run: ${html_url}"
+
+              elif [ "$queued_minutes" -ge 30 ]; then
+                alerts="${alerts}
+          🟡 *WARNING*: Workflow *$wf* queued for *${queued_minutes} mins*
+          🔗 View Run: ${html_url}"
+              fi
 
-            echo "$id,$num,$created,$severity" >> stale_runs.txt
-
-          done < runs.txt
-
-          if [ -s stale_runs.txt ]; then
-            echo "stale_found=true" >> $GITHUB_ENV
-          fi
+              new_state=$(echo "$new_state" | jq --arg wf "$wf" --arg val "queued_$queued_minutes" '.[$wf]=$val')
 
-      - name: Send Slack Alert if stale jobs found
-        if: env.stale_found == 'true'
-        run: |
-          WEBHOOK="${{ secrets.SLACK_WEBHOOK_URL }}"
-
-          while IFS=',' read -r id num created severity; do
-            [ -z "$id" ] && continue
-
-            run_url="https://github.com/${{ github.repository }}/actions/runs/$id"
-
-            if [ "$severity" = "severe" ]; then
-              msg="🔥 Severe Alert: Job stuck >90 min \nRun: $num\nCreated: $created\n$run_url"
-            elif [ "$severity" = "red" ]; then
-              msg="🚨 Red Alert: Job stuck >60 min \nRun: $num\nCreated: $created\n$run_url"
-            else
-              msg="⚠️ Yellow Alert: Job stuck >30 min \nRun: $num\nCreated: $created\n$run_url"
+            # FAILURE LOGIC
+            elif [ "$status" = "completed" ] && [ "$conclusion" = "failure" ]; then
+              if [ "$prev" != "failure" ]; then
+                alerts="${alerts}
+          ❌*FAILED*: Workflow *$wf* failed"
             fi
+              new_state=$(echo "$new_state" | jq --arg wf "$wf" --arg val "failure" '.[$wf]=$val')
+
+            # RECOVERY LOGIC
+            elif [ "$status" = "completed" ] && [ "$conclusion" = "success" ]; then
+              if [[ "$prev" == failure* || "$prev" == queued* ]]; then
+                alerts="${alerts}
+          ✅ *RECOVERED*: Workflow *$wf* is back to normal
+          🔗 View Run: ${html_url}"
+              fi
+              new_state=$(echo "$new_state" | jq --arg wf "$wf" --arg val "success" '.[$wf]=$val')
 
-            payload=$(jq -n --arg text "$msg" '{text: $text}')
-
-            curl -X POST -H 'Content-type: application/json' \
-              --data "$payload" \
-              "$WEBHOOK"
+            else
+              new_state=$(echo "$new_state" | jq --arg wf "$wf" --arg val "$status" '.[$wf]=$val')
+            fi
 
-            sleep 1
+          done
+
+          # Send Slack alert if needed
+          if [ -n "$alerts" ]; then
+            echo -e "Alerts: $alerts"
+
+            payload=$(jq -n --arg text "$alerts" '{
+              "blocks": [
+                {
+                  "type": "header",
+                  "text": {
+                    "type": "plain_text",
+                    "text": "🚨Workflow Health Alert"
+                  }
+                },
+                {
+                  "type": "section",
+                  "text": {
+                    "type": "mrkdwn",
+                    "text": $text
+                  }
+                }
+              ]
+            }')
+
+            curl -sS -X POST -H "Content-type: application/json" \
+              --data "$payload" \
+              "$SLACK_WEBHOOK_URL"
+          fi
 
-          done < stale_runs.txt
+          # Save new state (NOTE: written to the runner workspace only; no step commits or caches it)
+          echo "$new_state" > "$state_file"