ST0x-Technology · alastairong1 · Apr 28, 2026
diff --git a/docs/ops.md b/docs/ops.md
@@ -0,0 +1,92 @@
+# Operations cheat sheet
+
+Quick journalctl + curl recipes for the deployed `rest-api` service. SSH in with `nix develop -c remote` (or `ssh root@<host>` if your key is in `roles.ssh`).
+
+## Service health
+
+```bash
+# Quick liveness probe (no auth)
+curl -sS https://api.preview.st0x.io/health | jq
+
+# Full status — includes db connectivity, raindex sync, cache_warmer
+curl -sS https://api.preview.st0x.io/health/detailed | jq
+```
+
+Key fields in `/health/detailed.cache_warmer`:
+- `running` — `false` until the warmer completes its first cycle (~15-30s after restart while caches are cold)
+- `last_cycle_ms` — should track the steady-state cycle duration; sustained > 10s suggests upstream RPC slowness
+- `seconds_since_last_complete` — should bounce between `0` and `~20` seconds (cycle duration + REFRESH_INTERVAL); a much higher value means the warmer has frozen
+- `last_errors` — per-token failures during the last cycle; non-zero is worth investigating
+
+## Common journalctl queries
+
+All queries run via `ssh root@api.preview.st0x.io '...'` or after `nix develop -c remote`.
+
+### 429 rate
+
+```bash
+# Count in the last hour
+journalctl -u rest-api --since '1 hour ago' --no-pager | grep -c 'error code 429'
+
+# Breakdown by error signature — counts of `error code -32016`, `error code 429` and `StalePrice` (per-RPC only insofar as the backing RPC is identifiable from the error body)
+journalctl -u rest-api --since '1 hour ago' --no-pager \
+  | grep -oE 'error code -32016|error code 429|StalePrice' \
+  | sort | uniq -c
+```
+
+### Cache warmer cycles
+
+```bash
+# Last 10 cycle durations + completion timestamps
+journalctl -u rest-api --since '10 minutes ago' --no-pager \
+  | grep 'cache warmer: orders-by-token refresh complete' \
+  | sed -E 's/.*timestamp":"([^"]+)".*duration_ms":"?([0-9]+)"?.*/\1  cycle_ms=\2/' \
+  | tail -10
+```
+
+### ERROR-level rate
+
+```bash
+journalctl -u rest-api --since '5 minutes ago' --no-pager \
+  | grep -c 'level":"ERROR'
+```
+
+Most ERROR lines are benign (`No matching routes for HEAD /health` from external uptime checkers, or `task NNNN was cancelled` during graceful restart). Real signal:
+- `failed to query orders` outside a deploy window
+- `applied RPC override` should appear once on startup with the expected `url_count`
+
+### Slow requests
+
+```bash
+# Requests > 5s in the last hour (raw rocket access logs)
+journalctl -u rest-api --since '1 hour ago' --no-pager \
+  | grep 'request completed' \
+  | grep -oE 'duration_ms":[0-9]+\.[0-9]+' \
+  | awk -F: '$2 > 5000 { print }' \
+  | wc -l
+```
+
+## Smoke tests
+
+```bash
+# Run the smoke battery against the live preview
+API_KEY=<id> API_SECRET=<secret> ./scripts/smoke.sh
+
+# Override target
+API_URL=https://api.st0x.io API_KEY=... API_SECRET=... ./scripts/smoke.sh
+```
+
+The script returns non-zero on any FAIL. Run it post-deploy or wire it into a cron job with alerting. SLOW (a passing check that took longer than `LATENCY_BUDGET_MS`, default `3000` ms) is reported as a warning, not a failure.
+
+## Suggested cron / external monitoring
+
+A minimal external probe (run from any machine that can reach the public hostname):
+
+```bash
+# Run every 5 minutes; alert on non-zero exit or 502/503 in the body
+*/5 * * * * cd /path/to/st0x.rest.api && \
+  API_KEY=... API_SECRET=... ./scripts/smoke.sh > /tmp/smoke.last 2>&1 || \
+  alert-channel "smoke failed: $(tail -5 /tmp/smoke.last)"
+```
+
+Higher-fidelity options (Prometheus + Grafana, Datadog, etc.) are deferred — the smoke + journalctl recipes cover most regressions for a single-instance preview.
diff --git a/scripts/smoke.sh b/scripts/smoke.sh
@@ -0,0 +1,145 @@
+#!/usr/bin/env bash
+# smoke.sh — End-to-end correctness + latency smoke tests against a deployed
+# st0x-rest-api instance. Designed to be run post-deploy or on a cron.
+#
+# Usage:
+#   API_URL=https://api.preview.st0x.io \
+#     API_KEY=<key-id> API_SECRET=<secret> \
+#     ./scripts/smoke.sh
+#
+# Exits 0 if all checks pass, non-zero otherwise. Prints a summary with
+# per-check status + latency. Uses only curl + jq.
+
+set -uo pipefail
+
+API_URL="${API_URL:-https://api.preview.st0x.io}"
+API_KEY="${API_KEY:-}"
+API_SECRET="${API_SECRET:-}"
+
+# Tokens to probe. Override via env if the registry changes.
+USDC_BASE="${SMOKE_USDC:-0x833589fcd6edb6e08f4c7c32d4f71b54bda02913}"
+SAMPLE_OWNER="${SMOKE_OWNER:-0x71b94911fd1ce621fc40970450004c544e5287a8}"
+
+# Latency budget per endpoint, in ms. Failures over budget are warnings, not
+# hard failures, so a flaky network doesn't sink CI; tune if real regressions
+# slip through.
+LATENCY_BUDGET_MS="${LATENCY_BUDGET_MS:-3000}"
+
+PASS=0
+FAIL=0
+WARN=0
+
+color() {
+    case "$1" in
+        green) printf '\033[32m%s\033[0m' "$2" ;;
+        red)   printf '\033[31m%s\033[0m' "$2" ;;
+        yellow) printf '\033[33m%s\033[0m' "$2" ;;
+        *) printf '%s' "$2" ;;
+    esac
+}
+
+# probe NAME METHOD PATH EXPECTED_STATUS [JQ_FILTER]
+# The optional JQ_FILTER must produce a non-null, non-empty value for the
+# check to pass — used to assert on response shape, not just status code.
+probe() {
+    local name="$1"
+    local method="$2"
+    local path="$3"
+    local expected_status="$4"
+    local jq_filter="${5:-}"
+    local auth_header=""
+    if [[ -n "$API_KEY" && -n "$API_SECRET" ]]; then
+        auth_header="-u $API_KEY:$API_SECRET"
+    fi
+
+    local tmp
+    tmp=$(mktemp)
+    # shellcheck disable=SC2086
+    local result
+    result=$(curl -sS -X "$method" $auth_header \
+        -o "$tmp" \
+        -w '%{http_code} %{time_total}\n' \
+        --max-time 30 \
+        "$API_URL$path" 2>&1) || true
+
+    local status time_s
+    status=$(echo "$result" | awk '{print $1}')
+    time_s=$(echo "$result" | awk '{print $2}')
+    local time_ms
+    time_ms=$(awk -v t="$time_s" 'BEGIN { printf "%d", t * 1000 }')
+
+    local check_status="FAIL"
+    local detail=""
+
+    if [[ "$status" == "$expected_status" ]]; then
+        if [[ -n "$jq_filter" ]]; then
+            if jq -e "$jq_filter" >/dev/null 2>&1 < "$tmp"; then
+                check_status="PASS"
+            else
+                check_status="FAIL"
+                detail="(shape mismatch)"
+            fi
+        else
+            check_status="PASS"
+        fi
+    else
+        body=$(head -c 200 "$tmp")
+        detail="(got $status, body: $body)"
+    fi
+
+    rm -f "$tmp"
+
+    local latency_marker=""
+    if [[ "$check_status" == "PASS" && "$time_ms" -gt "$LATENCY_BUDGET_MS" ]]; then
+        latency_marker=" $(color yellow SLOW)"
+        WARN=$((WARN + 1))
+    fi
+
+    case "$check_status" in
+        PASS)
+            printf '  [%s] %-50s %4dms%s\n' "$(color green PASS)" "$name" "$time_ms" "$latency_marker"
+            PASS=$((PASS + 1))
+            ;;
+        *)
+            printf '  [%s] %-50s %4dms %s\n' "$(color red FAIL)" "$name" "$time_ms" "$detail"
+            FAIL=$((FAIL + 1))
+            ;;
+    esac
+}
+
+# Main sequence: run every probe, print the tallies, and exit non-zero when
+# at least one probe failed.
+echo "smoke tests against $API_URL"
+echo "  budget per check: ${LATENCY_BUDGET_MS}ms"
+echo
+
+# 1. Public endpoints (no auth)
+probe "GET /health"                          GET "/health" 200 '.status == "ok"'
+probe "GET /health/detailed"                 GET "/health/detailed" 200 '.status'
+probe "GET /health/detailed has cache_warmer" GET "/health/detailed" 200 '.cache_warmer'
+
+# 2. Protected endpoints reject missing auth. The credentials are blanked
+# for this one probe and restored straight after.
+SAVED_KEY="$API_KEY"; SAVED_SECRET="$API_SECRET"
+API_KEY="" API_SECRET=""
+probe "GET /v1/tokens (no auth)"             GET "/v1/tokens" 401
+API_KEY="$SAVED_KEY"; API_SECRET="$SAVED_SECRET"
+
+# 3. Authenticated endpoints — only run if creds are set
+if [[ -n "$API_KEY" && -n "$API_SECRET" ]]; then
+    probe "GET /v1/tokens"                       GET "/v1/tokens" 200 '.tokens | type == "array"'
+    # jq's `|` binds looser than `and`, so the array check is parenthesised.
+    # Without the parentheses `.pagination` is evaluated against the orders
+    # array, jq errors out, and the probe fails on every healthy response.
+    probe "GET /v1/orders/token/{usdc}"          GET "/v1/orders/token/$USDC_BASE" 200 '(.orders | type == "array") and .pagination'
+    probe "GET /v1/orders/owner/{owner}"         GET "/v1/orders/owner/$SAMPLE_OWNER" 200 '.orders | type == "array"'
+    probe "GET /v1/trades/token/{usdc}"          GET "/v1/trades/token/$USDC_BASE?pageSize=10" 200 '.trades | type == "array"'
+    probe "GET /v1/trades/{owner}"               GET "/v1/trades/$SAMPLE_OWNER?pageSize=10" 200 '.trades | type == "array"'
+    # Path validation only kicks in after auth succeeds — Rocket auth fairing
+    # runs first, so an invalid-address probe without auth would 401.
+    probe "GET /v1/orders/token/<bad>"           GET "/v1/orders/token/not-an-address" 422
+else
+    echo "  (skipping authenticated checks; set API_KEY + API_SECRET to enable)"
+fi
+
+echo
+echo "summary: $(color green "$PASS pass"), $(color red "$FAIL fail"), $(color yellow "$WARN slow")"
+
+if [[ "$FAIL" -gt 0 ]]; then
+    exit 1
+fi
+exit 0
diff --git a/scripts/uptimerobot-setup.sh b/scripts/uptimerobot-setup.sh
@@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+# uptimerobot-setup.sh — Creates the 3 baseline monitors against
+# `${API_URL}` via UptimeRobot's REST API. Run once per environment
+# (preview, prod). Re-running creates duplicates, so check the dashboard
+# first if you're not sure whether they already exist.
+#
+# Usage:
+#   UPTIMEROBOT_API_KEY=<your-main-api-key> ./scripts/uptimerobot-setup.sh
+#
+#   # Override target (default: https://api.preview.st0x.io)
+#   API_URL=https://api.st0x.io \
+#     UPTIMEROBOT_API_KEY=... ./scripts/uptimerobot-setup.sh
+#
+#   # Attach an existing alert contact (Telegram, email, etc.) at creation:
+#   UPTIMEROBOT_API_KEY=... \
+#     ALERT_CONTACT_ID=8253505 \
+#     ./scripts/uptimerobot-setup.sh
+#
+# Get your API key from: https://uptimerobot.com/integrations/ → "Main API Key".
+
+set -uo pipefail
+
+: "${UPTIMEROBOT_API_KEY:?UPTIMEROBOT_API_KEY is required}"
+API_URL="${API_URL:-https://api.preview.st0x.io}"
+INTERVAL_SECONDS="${INTERVAL:-300}"  # 5 minutes (free-tier minimum)
+# Optional: attach an existing alert contact (e.g. a Telegram integration).
+# Discover candidate IDs with:
+#   curl -sS -X POST -d api_key=$UPTIMEROBOT_API_KEY -d format=json \
+#     https://api.uptimerobot.com/v2/getAlertContacts | jq '.alert_contacts'
+ALERT_CONTACT_ID="${ALERT_CONTACT_ID:-}"
+
+# Use the full hostname as the friendly-name prefix so alerts (especially
+# Telegram pushes that show only the friendly_name) immediately identify
+# which environment fired. Override with FRIENDLY_LABEL if you'd rather
+# something shorter.
+DEFAULT_LABEL=$(echo "$API_URL" | sed -E 's|https?://||; s|/.*$||')
+LABEL="${FRIENDLY_LABEL:-$DEFAULT_LABEL}"
+
+echo "Creating UptimeRobot monitors for $API_URL (label: $LABEL)"
+echo
+
+# create_monitor NAME URL TYPE THRESHOLD_MIN [KEYWORD]
+#
+# Creates one monitor via POST https://api.uptimerobot.com/v2/newMonitor.
+#
+# Globals:
+#   UPTIMEROBOT_API_KEY, INTERVAL_SECONDS, ALERT_CONTACT_ID (read)
+# Arguments:
+#   NAME           friendly_name of the monitor
+#   URL            URL to monitor
+#   TYPE           1 = HTTP(s) status, 2 = HTTP(s) keyword
+#   THRESHOLD_MIN  minutes of detected down before alerting (0 = immediate);
+#                  only sent when ALERT_CONTACT_ID is set
+#   KEYWORD        text expected in a healthy response body (TYPE 2 only)
+# Outputs:
+#   one "[ok]" or "[fail]" line on stdout, plus failure details
+# Returns:
+#   0 when the API answers stat=ok, 1 otherwise
+create_monitor() {
+    local name="$1"
+    local url="$2"
+    local type="$3"
+    local threshold="$4"
+    local keyword="${5:-}"
+
+    local args=(
+        --data-urlencode "api_key=$UPTIMEROBOT_API_KEY"
+        --data-urlencode "format=json"
+        --data-urlencode "friendly_name=$name"
+        --data-urlencode "url=$url"
+        --data-urlencode "type=$type"
+        --data-urlencode "interval=$INTERVAL_SECONDS"
+    )
+
+    if [[ -n "$ALERT_CONTACT_ID" ]]; then
+        # Format: <contact id>_<threshold minutes>_<recurrence>.
+        args+=(--data-urlencode "alert_contacts=${ALERT_CONTACT_ID}_${threshold}_0")
+    fi
+
+    if [[ "$type" == "2" && -n "$keyword" ]]; then
+        # keyword_type selects the condition that flags the monitor as DOWN:
+        # 1 = keyword exists, 2 = keyword does not exist. These monitors must
+        # alert when the healthy marker is missing from the body, hence 2.
+        args+=(
+            --data-urlencode "keyword_type=2"
+            --data-urlencode "keyword_case_type=0"
+            --data-urlencode "keyword_value=$keyword"
+        )
+    fi
+
+    local resp curl_rc=0
+    resp=$(curl -sS -X POST "${args[@]}" \
+        https://api.uptimerobot.com/v2/newMonitor) || curl_rc=$?
+    if (( curl_rc != 0 )); then
+        echo "  [fail] $name"
+        echo "    curl exited with status $curl_rc"
+        return 1
+    fi
+
+    local stat id
+    # A non-JSON body makes jq fail; treat that like any other bad answer.
+    stat=$(echo "$resp" | jq -r '.stat // "unknown"' 2>/dev/null) || stat="unknown"
+    if [[ "$stat" == "ok" ]]; then
+        id=$(echo "$resp" | jq -r '.monitor.id // "?"')
+        echo "  [ok] $name (id=$id, threshold=${threshold}min)"
+        return 0
+    fi
+
+    echo "  [fail] $name"
+    echo "    response: $resp"
+    return 1
+}
+
+# Number of monitors that could not be created; drives the exit status.
+failures=0
+
+# Threshold 0 → page immediately on first detected failure (hard down).
+# Threshold 5 → wait one extra check interval before paging (avoids
+# flapping during deploy restarts and the post-restart cache-warmer
+# transient).
+create_monitor "$LABEL — liveness — /health" \
+    "$API_URL/health" 1 0 || failures=$((failures + 1))
+
+create_monitor "$LABEL — component health — /health/detailed status=ok" \
+    "$API_URL/health/detailed" 2 5 '"status":"ok"' || failures=$((failures + 1))
+
+create_monitor "$LABEL — cache warmer — /health/detailed running=true" \
+    "$API_URL/health/detailed" 2 5 '"running":true' || failures=$((failures + 1))
+
+echo
+if (( failures > 0 )); then
+    # Do not claim success (or exit 0) when the API rejected a monitor.
+    echo "$failures of 3 monitors could not be created — see the [fail] lines above." >&2
+    exit 1
+fi
+
+if [[ -n "$ALERT_CONTACT_ID" ]]; then
+    echo "Alert contact $ALERT_CONTACT_ID attached to all 3 monitors."
+else
+    echo "No ALERT_CONTACT_ID set — monitors won't page until you wire up alert"
+    echo "contacts via the dashboard or re-run with ALERT_CONTACT_ID set."
+fi