diff --git a/.github/workflows/Default-Runners-Fleet.yml b/.github/workflows/Default-Runners-Fleet.yml
new file mode 100644
index 0000000..5e8f4d5
--- /dev/null
+++ b/.github/workflows/Default-Runners-Fleet.yml
@@ -0,0 +1,58 @@
+name: Default-Runners-Fleet
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+  workflow_dispatch:
+  schedule:
+    - cron: '*/30 * * * *'
+
+jobs:
+  build:
+    strategy:
+      matrix:
+        runner: [
+          "ubuntu-24.04-ppc64le",
+          "ubuntu-24.04-ppc64le-p10",
+          "ubuntu-24.04-s390x"
+        ]
+      fail-fast: false
+
+    runs-on: ${{ matrix.runner }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Runner info
+        run: |
+          echo "Runner: ${{ matrix.runner }}"
+          uname -a
+          uptime
+
+      # CPU test
+      - name: CPU test
+        run: timeout 20s bash -c "yes > /dev/null" || true
+
+      # IO test
+      - name: IO test
+        run: |
+          set -e
+          dd if=/dev/zero of=testfile bs=1M count=256 oflag=dsync
+          rm -f testfile
+
+      # Network test
+      - name: Network test
+        run: |
+          set -e
+          ping -c 3 github.com
+          curl -I https://github.com
+
+      # Memory
+      - name: Memory check
+        run: free -h
+
+      # Process check
+      - name: Process snapshot
+        run: ps aux --sort=-%mem | head -10
diff --git a/.github/workflows/Large-Workers-Fleet.yml b/.github/workflows/Large-Workers-Fleet.yml
new file mode 100644
index 0000000..5e3fdb2
--- /dev/null
+++ b/.github/workflows/Large-Workers-Fleet.yml
@@ -0,0 +1,53 @@
+name: Large-Workers-Fleet
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 */1 * * *' # every 1 hour
+
+jobs:
+  health-check:
+    strategy:
+      matrix:
+        runner: [
+          "ubuntu-24.04-ppc64le-2xlarge",
+          "ubuntu-24.04-ppc64le-2xlarge-p10",
+          "ubuntu-24.04-ppc64le-4xlarge",
+          "ubuntu-24.04-ppc64le-4xlarge-p10"
+        ]
+      fail-fast: false
+
+    runs-on: ${{ matrix.runner }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Runner info
+        run: |
+          echo "Runner: ${{ matrix.runner }}"
+          uname -a
+          uptime
+
+      # CPU stress test
+      - name: CPU test
+        run: timeout 25s bash -c "yes > /dev/null" || true
+
+      # IO heavy test (larger)
+      - name: IO test
+        run: |
+          dd if=/dev/zero of=testfile bs=1M count=512 oflag=dsync
+          rm -f testfile
+
+      # Network test
+      - name: Network test
+        run: |
+          ping -c 4 github.com
+          curl -I https://github.com
+
+      # Memory check
+      - name: Memory check
+        run: free -h
+
+      # Process snapshot
+      - name: Process snapshot
+        run: ps aux --sort=-%mem | head -10
diff --git a/.github/workflows/watchdog.yml b/.github/workflows/watchdog.yml
new file mode 100644
--- /dev/null
+++ b/.github/workflows/watchdog.yml
@@ -0,0 +1,194 @@
+# Watchdog: every 5 minutes, inspect the latest run of each fleet workflow and
+# post a Slack alert when that run is stuck in the queue, has failed, or has
+# recovered. The last recorded state per workflow lives in watchdog_state.json,
+# which is committed back to the repository so the next run can read it.
+name: Watchdog - Multi Workflow Health Monitor
+
+on:
+  schedule:
+    - cron: "*/5 * * * *" # Runs every 5 minutes
+  workflow_dispatch:
+
+permissions:
+  actions: read
+  # Required to push watchdog_state.json back to the repository.
+  contents: write
+
+# Every run reads and then pushes the state file, so runs must not overlap.
+concurrency:
+  group: watchdog-state
+  cancel-in-progress: false
+
+jobs:
+  monitor:
+    runs-on: ubuntu-latest
+
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - name: Install dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y gh jq
+
+      - name: Checkout repo (for state persistence)
+        uses: actions/checkout@v4
+
+      - name: Monitor workflows
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
+        run: |
+          set -e
+
+          # GITHUB_REPOSITORY ("owner/name") is exported by the runner.
+          owner_repo="$GITHUB_REPOSITORY"
+          workflows=("Default-Runners-Fleet.yml" "Large-Workers-Fleet.yml")
+
+          state_file="watchdog_state.json"
+
+          # Previous state: a JSON object mapping workflow file -> last state.
+          # A missing or empty file means nothing has been recorded yet.
+          if [ -s "$state_file" ]; then
+            state=$(cat "$state_file")
+          else
+            state='{}'
+          fi
+
+          new_state='{}'
+          alerts=""
+          now_epoch=$(date -u +%s)
+
+          # Append one alert (headline plus a link to the run) to $alerts.
+          add_alert() {
+            alerts="${alerts}"$'\n'"$1"$'\n'"🔗 View Run: $2"
+          }
+
+          for wf in "${workflows[@]}"; do
+            echo "Checking workflow: $wf"
+
+            # Only the newest run is inspected, so request a single entry.
+            latest=$(gh api "repos/${owner_repo}/actions/workflows/${wf}/runs?per_page=1" \
+              --jq '.workflow_runs[0]')
+
+            if [ -z "$latest" ] || [ "$latest" = "null" ]; then
+              echo "No runs found for $wf"
+              continue
+            fi
+
+            status=$(echo "$latest" | jq -r '.status // "unknown"')
+            conclusion=$(echo "$latest" | jq -r '.conclusion // "unknown"')
+            created=$(echo "$latest" | jq -r '.created_at')
+            html_url=$(echo "$latest" | jq -r '.html_url')
+
+            prev=$(echo "$state" | jq -r --arg wf "$wf" '.[$wf] // "none"')
+
+            echo "Status: $status | Conclusion: $conclusion | Prev: $prev"
+
+            # Minutes since the run was created. If the timestamp cannot be
+            # parsed, fall back to "now" (0 minutes) rather than the epoch,
+            # which would report a queue time of decades.
+            created_epoch=$(date -u -d "$created" +%s 2>/dev/null || echo "$now_epoch")
+            queued_minutes=$(( (now_epoch - created_epoch) / 60 ))
+
+            # "failure" and "queued_<mins>" are only ever recorded once an
+            # alert has gone out, so they mark a problem awaiting RECOVERED.
+            case "$prev" in
+              failure* | queued_*) unresolved=true ;;
+              *) unresolved=false ;;
+            esac
+
+            # QUEUED LOGIC WITH SEVERITY
+            if [ "$status" = "queued" ] && [ "$queued_minutes" -ge 30 ]; then
+              if [ "$queued_minutes" -ge 90 ]; then
+                headline="🚨 *SEVERE*: Workflow *$wf* stuck in queue for *${queued_minutes} mins*"
+              elif [ "$queued_minutes" -ge 60 ]; then
+                headline="🔴 *HIGH*: Workflow *$wf* queued for *${queued_minutes} mins*"
+              else
+                headline="🟡 *WARNING*: Workflow *$wf* queued for *${queued_minutes} mins*"
+              fi
+              add_alert "$headline" "$html_url"
+              new_val="queued_${queued_minutes}"
+
+            # FAILURE LOGIC
+            elif [ "$status" = "completed" ] && [ "$conclusion" = "failure" ]; then
+              # Alert once per failure streak, not on every poll.
+              if [ "$prev" != "failure" ]; then
+                add_alert "❌ *FAILED*: Workflow *$wf* failed" "$html_url"
+              fi
+              new_val="failure"
+
+            # RECOVERY LOGIC
+            elif [ "$status" = "completed" ] && [ "$conclusion" = "success" ]; then
+              if [ "$unresolved" = true ]; then
+                add_alert "✅ *RECOVERED*: Workflow *$wf* is back to normal" "$html_url"
+              fi
+              new_val="success"
+
+            # Anything else (in progress, briefly queued, cancelled, ...) is
+            # not a verdict: keep an outstanding problem on record so the
+            # eventual success is still reported as a recovery.
+            elif [ "$unresolved" = true ]; then
+              new_val="$prev"
+            else
+              new_val="$status"
+            fi
+
+            new_state=$(echo "$new_state" | jq --arg wf "$wf" --arg val "$new_val" '.[$wf]=$val')
+          done
+
+          # Send Slack alert if needed
+          if [ -n "$alerts" ]; then
+            echo -e "Alerts: $alerts"
+
+            if [ -z "$SLACK_WEBHOOK_URL" ]; then
+              echo "::error::SLACK_WEBHOOK_URL is not configured; cannot deliver alerts"
+              exit 1
+            fi
+
+            payload=$(jq -n --arg text "$alerts" '{
+              "blocks": [
+                {
+                  "type": "header",
+                  "text": {
+                    "type": "plain_text",
+                    "text": "🚨Workflow Health Alert"
+                  }
+                },
+                {
+                  "type": "section",
+                  "text": {
+                    "type": "mrkdwn",
+                    "text": $text
+                  }
+                }
+              ]
+            }')
+
+            # -X is required: a bare "POST" would be treated as another URL.
+            # --fail turns an HTTP error from Slack into a step failure, so
+            # the state below is not saved when delivery did not succeed.
+            curl -sS --fail -X POST -H "Content-type: application/json" \
+              --data "$payload" \
+              "$SLACK_WEBHOOK_URL"
+          fi
+
+          # Save new state
+          echo "$new_state" > "$state_file"
+
+      - name: Persist state
+        run: |
+          set -e
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git add watchdog_state.json
+
+          # Commit only when the recorded state actually changed.
+          if git diff --cached --quiet; then
+            echo "State unchanged; nothing to commit"
+          else
+            git commit -m "chore(watchdog): update workflow state [skip ci]"
+            git push
+          fi