Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions .github/workflows/Default-Runners-Fleet.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
name: Default-Runners-Fleet

# Health probe for the default-size ppc64le / s390x runners.
# Triggered by PRs and pushes targeting main, manually, and every 30 minutes.
on:
  pull_request:
    branches: [main]
  push:
    branches: [main]
  workflow_dispatch:
  schedule:
    - cron: '*/30 * * * *'

# The job only checks out the repository, so a read-only token is sufficient.
permissions:
  contents: read

jobs:
  build:
    strategy:
      # Keep probing the remaining runners when one of them fails.
      fail-fast: false
      matrix:
        runner:
          - ubuntu-24.04-ppc64le
          - ubuntu-24.04-ppc64le-p10
          - ubuntu-24.04-s390x

    runs-on: ${{ matrix.runner }}
    # A probe that hangs should fail quickly instead of holding the runner
    # for the 360-minute default job timeout.
    timeout-minutes: 15

    steps:
      - uses: actions/checkout@v4

      - name: Runner info
        run: |
          echo "Runner: ${{ matrix.runner }}"
          uname -a
          uptime

      # CPU test: spin one core for 20 seconds. `timeout` exits non-zero
      # after killing the load generator, hence the `|| true`.
      - name: CPU test
        run: timeout 20s bash -c "yes > /dev/null" || true

      # IO test: 256 MiB of synchronous 1 MiB writes.
      - name: IO test
        run: |
          set -e
          # Remove the scratch file even when dd fails part-way through.
          trap 'rm -f testfile' EXIT
          dd if=/dev/zero of=testfile bs=1M count=256 oflag=dsync

      # Network test: ICMP reachability plus an HTTPS HEAD request.
      - name: Network test
        run: |
          set -e
          ping -c 3 github.com
          # --fail turns HTTP error statuses into a failing exit code and
          # --max-time bounds a stalled connection.
          curl --fail --max-time 30 -I https://github.com

      # Memory
      - name: Memory check
        run: free -h

      # Process check: header line plus the top memory consumers.
      - name: Process snapshot
        run: ps aux --sort=-%mem | head -10
53 changes: 53 additions & 0 deletions .github/workflows/Large-Workers-Fleet.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
name: Large-Workers-Fleet

# Health probe for the 2xlarge / 4xlarge ppc64le runners.
# Triggered manually and at the top of every hour.
on:
  workflow_dispatch:
  schedule:
    - cron: '0 * * * *'  # every 1 hour

# The job only checks out the repository, so a read-only token is sufficient.
permissions:
  contents: read

jobs:
  health-check:
    strategy:
      # Keep probing the remaining runners when one of them fails.
      fail-fast: false
      matrix:
        runner:
          - ubuntu-24.04-ppc64le-2xlarge
          - ubuntu-24.04-ppc64le-2xlarge-p10
          - ubuntu-24.04-ppc64le-4xlarge
          - ubuntu-24.04-ppc64le-4xlarge-p10

    runs-on: ${{ matrix.runner }}
    # A probe that hangs should fail quickly instead of holding a large
    # runner for the 360-minute default job timeout.
    timeout-minutes: 15

    steps:
      - uses: actions/checkout@v4

      - name: Runner info
        run: |
          echo "Runner: ${{ matrix.runner }}"
          uname -a
          uptime

      # CPU stress test: spin one core for 25 seconds. `timeout` exits
      # non-zero after killing the load generator, hence the `|| true`.
      - name: CPU test
        run: timeout 25s bash -c "yes > /dev/null" || true

      # IO heavy test (larger): 512 MiB of synchronous 1 MiB writes.
      - name: IO test
        run: |
          set -e
          # Remove the scratch file even when dd fails part-way through.
          trap 'rm -f testfile' EXIT
          dd if=/dev/zero of=testfile bs=1M count=512 oflag=dsync

      # Network test: ICMP reachability plus an HTTPS HEAD request.
      - name: Network test
        run: |
          set -e
          ping -c 4 github.com
          # --fail turns HTTP error statuses into a failing exit code and
          # --max-time bounds a stalled connection.
          curl --fail --max-time 30 -I https://github.com

      # Memory check
      - name: Memory check
        run: free -h

      # Process snapshot: header line plus the top memory consumers.
      - name: Process snapshot
        run: ps aux --sort=-%mem | head -10
150 changes: 150 additions & 0 deletions .github/workflows/watchdog.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
name: Watchdog - Multi Workflow Health Monitor

# Polls the latest run of each fleet workflow, posts Slack alerts on state
# transitions (stuck in queue, failed, recovered) and stores the last known
# state per workflow in watchdog_state.json, committed back to the repository.
on:
  schedule:
    - cron: "*/5 * * * *"  # Runs every 5 minutes
  workflow_dispatch:

permissions:
  actions: read
  # Needed to push the updated state file.
  contents: write

# Never let two watchdog runs race each other on the state-file push.
concurrency:
  group: watchdog
  cancel-in-progress: false

jobs:
  monitor:
    runs-on: ubuntu-latest
    timeout-minutes: 10

    defaults:
      run:
        shell: bash

    steps:
      - name: Install dependencies
        run: |
          # Only touch apt when a tool is actually missing from the image.
          missing=()
          for tool in gh jq; do
            command -v "$tool" >/dev/null 2>&1 || missing+=("$tool")
          done
          if [ "${#missing[@]}" -gt 0 ]; then
            sudo apt-get update
            sudo apt-get install -y "${missing[@]}"
          fi

      - name: Checkout repo (for state persistence)
        uses: actions/checkout@v4

      - name: Monitor workflows
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
        run: |
          set -e

          owner_repo="$GITHUB_REPOSITORY"
          workflows=("Default-Runners-Fleet.yml" "Large-Workers-Fleet.yml")
          state_file="watchdog_state.json"
          nl=$'\n'

          # Previous state: { "<workflow file>": "<state>" }. Anything that
          # is missing, empty or not a JSON object counts as "no history".
          if [ -s "$state_file" ] && jq -e 'type == "object"' "$state_file" >/dev/null 2>&1; then
            state=$(cat "$state_file")
          else
            state='{}'
          fi

          new_state='{}'
          alerts=""

          # Append one alert (headline + run link) to the Slack message.
          add_alert() {
            alerts+="${alerts:+$nl}$1${nl}🔗 View Run: $2"
          }

          # True when the stored state represents an incident that was
          # already alerted on: "failure" or "queued_<severity>".
          is_alert_state() {
            [[ "$1" == failure || "$1" == queued_* ]]
          }

          now_epoch=$(date -u +%s)

          for wf in "${workflows[@]}"; do
            echo "Checking workflow: $wf"

            # Runs are listed newest first, so one entry is enough.
            runs_json=$(gh api "repos/$owner_repo/actions/workflows/$wf/runs?per_page=1")
            latest=$(jq '.workflow_runs[0]' <<<"$runs_json")

            if [ "$latest" = "null" ]; then
              echo "No runs found for $wf"
              continue
            fi

            status=$(jq -r '.status // "unknown"' <<<"$latest")
            conclusion=$(jq -r '.conclusion // "unknown"' <<<"$latest")
            created=$(jq -r '.created_at' <<<"$latest")
            html_url=$(jq -r '.html_url' <<<"$latest")

            prev=$(jq -r --arg wf "$wf" '.[$wf] // "none"' <<<"$state")

            echo "Status: $status | Conclusion: $conclusion | Prev: $prev"

            # Minutes since the run was created. An unparsable timestamp
            # yields 0 rather than a bogus "queued for millions of minutes".
            if created_epoch=$(date -u -d "$created" +%s 2>/dev/null); then
              queued_minutes=$(( (now_epoch - created_epoch) / 60 ))
            else
              echo "::warning::Could not parse created_at '$created' for $wf"
              queued_minutes=0
            fi

            # QUEUED LOGIC WITH SEVERITY
            # The state records the severity level, not the minute count, so
            # an alert is sent once per escalation instead of on every poll.
            if [ "$status" = "queued" ] && [ "$queued_minutes" -ge 30 ]; then
              if [ "$queued_minutes" -ge 90 ]; then
                current="queued_severe"
                headline="🚨 *SEVERE*: Workflow *$wf* stuck in queue for *${queued_minutes} mins*"
              elif [ "$queued_minutes" -ge 60 ]; then
                current="queued_high"
                headline="🔴 *HIGH*: Workflow *$wf* queued for *${queued_minutes} mins*"
              else
                current="queued_warning"
                headline="🟡 *WARNING*: Workflow *$wf* queued for *${queued_minutes} mins*"
              fi
              if [ "$prev" != "$current" ]; then
                add_alert "$headline" "$html_url"
              fi

            # FAILURE LOGIC
            elif [ "$status" = "completed" ] && [ "$conclusion" = "failure" ]; then
              current="failure"
              if [ "$prev" != "failure" ]; then
                add_alert "❌ *FAILED*: Workflow *$wf* failed" "$html_url"
              fi

            # RECOVERY LOGIC
            elif [ "$status" = "completed" ] && [ "$conclusion" = "success" ]; then
              current="success"
              if is_alert_state "$prev"; then
                add_alert "✅ *RECOVERED*: Workflow *$wf* is back to normal" "$html_url"
              fi

            # Neutral state (in progress, recently queued, cancelled, ...):
            # keep an open incident open so the eventual success is still
            # reported as a recovery and a repeated failure is not re-alerted.
            elif is_alert_state "$prev"; then
              current="$prev"

            else
              current="$status"
            fi

            new_state=$(jq --arg wf "$wf" --arg val "$current" '.[$wf] = $val' <<<"$new_state")
          done

          # Send Slack alert if needed
          if [ -n "$alerts" ]; then
            printf 'Alerts:\n%s\n' "$alerts"

            if [ -z "$SLACK_WEBHOOK_URL" ]; then
              echo "::warning::SLACK_WEBHOOK_URL is not configured; alerts were not delivered"
            else
              payload=$(jq -n --arg text "$alerts" '{
                "blocks": [
                  {
                    "type": "header",
                    "text": {
                      "type": "plain_text",
                      "text": "🚨Workflow Health Alert"
                    }
                  },
                  {
                    "type": "section",
                    "text": {
                      "type": "mrkdwn",
                      "text": $text
                    }
                  }
                ]
              }')

              # -X POST: without -X, "POST" is parsed as an extra URL.
              # --fail makes a rejected webhook call abort the step before
              # the state is saved, so the alert is retried on the next poll.
              curl -sS --fail --max-time 30 -X POST \
                -H "Content-type: application/json" \
                --data "$payload" \
                "$SLACK_WEBHOOK_URL"
            fi
          fi

          # Save new state
          echo "$new_state" > "$state_file"

      # The runner is discarded after the job, so the state file only
      # survives to the next poll if it is committed back to the repository.
      - name: Persist state
        run: |
          set -e
          state_file="watchdog_state.json"

          git add "$state_file"
          if git diff --cached --quiet -- "$state_file"; then
            echo "State unchanged; nothing to commit."
            exit 0
          fi

          git config user.name "github-actions[bot]"
          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
          git commit -m "chore(watchdog): update workflow state [skip ci]" -- "$state_file"

          # A rejected push (branch protection, concurrent commit) only costs
          # one cycle of history, so report it without failing the watchdog.
          if ! git push origin "HEAD:${GITHUB_REF_NAME}"; then
            echo "::warning::Could not push $state_file; state will not carry over to the next run"
          fi
Loading