diff --git a/.claude/commands/auto-review.md b/.claude/commands/auto-review.md index b890bab..fd54bac 100644 --- a/.claude/commands/auto-review.md +++ b/.claude/commands/auto-review.md @@ -18,6 +18,7 @@ Perform a comprehensive code review for the OCI automation project covering both - Documentation quality and code comments - Adherence to coding standards and conventions - Verify that README.md and docs are updated for any new features or config changes +- **Linter Policy Compliance**: Ensure any new linters focus on code quality/security/functional issues, NOT arbitrary style rules ### 2. Security - Check for potential security vulnerabilities diff --git a/.github/workflows/infrastructure-deployment.yml b/.github/workflows/infrastructure-deployment.yml index 1e24dd0..6d02875 100644 --- a/.github/workflows/infrastructure-deployment.yml +++ b/.github/workflows/infrastructure-deployment.yml @@ -62,7 +62,7 @@ concurrency: env: # Global environment variables DEBUG: ${{ inputs.verbose_output && 'true' || 'false' }} - ENABLE_NOTIFICATIONS: ${{ inputs.send_notifications && 'true' || 'false' }} + ENABLE_NOTIFICATIONS: ${{ github.event_name == 'workflow_dispatch' && (inputs.send_notifications && 'true' || 'false') || 'true' }} # Enable instance check by default to use state management cache (can be overridden manually) CHECK_EXISTING_INSTANCE: ${{ github.event_name == 'workflow_dispatch' && (inputs.check_existing_instance && 'true' || 'false') || 'true' }} # Suppress OCI CLI file permissions warnings diff --git a/.github/workflows/super-linter.yml b/.github/workflows/super-linter.yml index 513e21e..bdbd171 100644 --- a/.github/workflows/super-linter.yml +++ b/.github/workflows/super-linter.yml @@ -31,8 +31,14 @@ jobs: VALIDATE_ALL_CODEBASE: false DEFAULT_BRANCH: "master" GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Disable all prettier validators (style-only enforcement) VALIDATE_YAML_PRETTIER: false VALIDATE_JSON_PRETTIER: false + VALIDATE_MARKDOWN_PRETTIER: false + 
VALIDATE_CSS_PRETTIER: false + VALIDATE_HTML_PRETTIER: false + VALIDATE_JAVASCRIPT_PRETTIER: false + VALIDATE_TYPESCRIPT_PRETTIER: false # Disable pure style checkers VALIDATE_SHELL_SHFMT: false # Configure shellcheck to only show errors/warnings (not style) diff --git a/.markdownlint.json b/.markdownlint.json new file mode 100644 index 0000000..6a5a9f6 --- /dev/null +++ b/.markdownlint.json @@ -0,0 +1,5 @@ +{ + "MD026": false, + "MD013": false, + "MD033": false +} \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md index 05670f9..ffe693d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -33,10 +33,31 @@ oci_args+=("--read-timeout" "15") # 15s vs 60s default ### Error Classification (scripts/utils.sh) ```bash -CAPACITY: "capacity|quota|limit|429" → Schedule retry (treat as success) -DUPLICATE: "already exists" → Success -TRANSIENT: "internal|network|timeout" → Retry 3x same AD, then next AD -AUTH/CONFIG: "authentication|invalid.*ocid" → Alert user immediately +USER_LIMIT_REACHED: "limitexceeded.*core.*count" → Cache 24h, exit 5 +ORACLE_CAPACITY_UNAVAILABLE: "out of host capacity" → Retry, exit 2 +CAPACITY: "capacity|quota|limit|429" → Retry (success) +DUPLICATE: "already exists" → Success +TRANSIENT: "internal|network|timeout" → 3x retry, next AD +AUTH/CONFIG: "authentication|invalid.*ocid" → Alert user +``` + +### Error-Driven Limit Detection (PR #69) +Prevents 4,320+ monthly futile API calls. 24h cache TTL, exit code 5. +```bash +# Pre-flight cache check (launch-parallel.sh) +get_cached_limit_state "${A1_FLEX_CONFIG[SHAPE]}" && should_launch_a1=false + +# Auto-detect from failures (launch-instance.sh) +"USER_LIMIT_REACHED") set_cached_limit_state "${OCI_SHAPE}" "true"; return 5 ;; +``` + +### Architecture-Aware Timeout Handling (PR #70) +Preserves capacity/limit error codes (2, 5). Only overrides generic failures (1). 
+```bash +# Only timeout shapes that were launched and failed generically +if [[ "$should_launch_a1" == true && $STATUS_A1 -eq 1 ]]; then + STATUS_A1=$EXIT_TIMEOUT_ERROR +fi ``` ### Parallel Execution Pattern (launch-parallel.sh) @@ -138,6 +159,44 @@ gh run watch - **Shape requirements**: Flexible shapes need `--shape-config` parameter - **Never remove** OCI CLI optimization flags - they provide 93% performance improvement +## Telegram Notification Policy + +### SEND notifications for: +- ✅ **SUCCESS**: Instance creation completed (with instance details) +- ❌ **FAILURE**: Authentication/configuration errors requiring user action +- 🚨 **CRITICAL**: System failures requiring immediate attention +- ❌ **ERROR**: Unexpected failures needing investigation + +### DO NOT send notifications for: +- ❌ User limits reached (expected free tier behavior - normal operation) +- ❌ Oracle capacity unavailable (expected operational condition - will retry) +- ❌ Rate limiting (standard cloud provider behavior - will retry) +- ❌ Instance already exists (expected when using state management cache) +- ❌ Any condition that resolves through automated retry cycles + +### Notification Behavior: +- **Scheduled runs**: Notifications ENABLED by default (monitor automation) +- **Manual runs**: User can toggle notifications via workflow dispatch +- **Expected conditions**: Never generate notifications (silent operation) +- **Actual failures**: Always generate notifications (require attention) + +### Philosophy: +**Notify for successes and actionable failures. Never notify for expected operational conditions.** +Expected conditions (limits, capacity constraints) are normal automation behavior and resolve through retry cycles. + +## Linter Configuration Policy + +### Core Principle +**Linters in this project MUST NOT enforce arbitrary style rules.** Focus on code quality, security, and functional correctness only. 
+ +### Disabled Style Rules +- **All Prettier validators**: Disabled to prevent style conflicts with intentional formatting +- **Markdown style rules**: MD026 (trailing punctuation), MD013 (line length), MD033 (HTML tags) +- **Shell formatting**: VALIDATE_SHELL_SHFMT disabled (existing) + +### Philosophy +Linters should catch bugs, security issues, and functional problems - not enforce subjective style preferences that reduce documentation readability. + ## Oracle Cloud Specifics - **Flexible shapes need --shape-config parameter**: `{"ocpus": 4, "memoryInGBs": 24}` diff --git a/README.md b/README.md index 50c70c5..877f537 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,10 @@ Automated provisioning of Oracle Cloud free-tier instances (A1.Flex ARM & E2.1.M ## Features - **Parallel provisioning** of both instance types (~20s execution) +- **Error-driven limit detection** (saves 4,320+ monthly API calls) - **Multi-AD cycling** for higher success rates - **Smart error handling** with transient error retry +- **Architecture-aware timeout handling** (preserves capacity errors) - **Telegram notifications** on success/failure - **Secure credential management** via GitHub Secrets - **93% performance optimization** through CLI tuning @@ -122,6 +124,13 @@ chmod +x scripts/*.sh - **Multi-region support**: Works with any OCI region - **Comprehensive logging**: Structured output for debugging +## Documentation + +- **[CLAUDE.md](CLAUDE.md)** - Complete project architecture, patterns, and development guide +- **[Limits Management](docs/limits-management.md)** - Free tier limit detection and cache management +- **[Notification Policy](CLAUDE.md#telegram-notification-policy)** - Clear guidelines on when notifications are sent +- **[Linter Configuration](CLAUDE.md#linter-configuration-policy)** - Code quality focus over arbitrary style rules + ## License MIT License - See [LICENSE](LICENSE) file for details. 
diff --git a/docs/limits-management.md b/docs/limits-management.md new file mode 100644 index 0000000..b8f213e --- /dev/null +++ b/docs/limits-management.md @@ -0,0 +1,28 @@ +# Free Tier Limits Management + +Error-driven limit detection prevents 4,320+ monthly futile API calls via 24h cache. + +## Commands + +```bash +# Check limit status +./scripts/state-manager.sh limit-status + +# Check specific shape +./scripts/state-manager.sh check-limit "VM.Standard.E2.1.Micro" + +# Clear all cached limits +./scripts/state-manager.sh clear-limits + +# Manual limit override +./scripts/state-manager.sh set-limit "VM.Standard.A1.Flex" false +``` + +## Free Tier Limits +- **E2.1.Micro**: 2 instances max +- **A1.Flex**: 4 OCPUs total, 24GB total + +## Behavior +- Exit code 5: USER_LIMIT_REACHED (cached 24h) +- Exit code 2: ORACLE_CAPACITY_UNAVAILABLE (retry) +- Pre-flight cache check skips shapes at known limits \ No newline at end of file diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 665a6d1..0113403 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -141,6 +141,18 @@ This is **NOT a failure** - it's Oracle's normal response when free tier capacit - Delete unused boot volumes - Check compute instance pools +#### Problem: "User limit reached" (Exit Code 5) +**Expected Behavior:** NOT a failure - intelligent limit detection working correctly. 
+ +**Solutions:** +```bash +# Check status +./scripts/state-manager.sh limit-status + +# Clear cache (after terminating instances) +./scripts/state-manager.sh clear-limits +``` + #### Problem: "Shape not supported in availability domain" **Symptoms:** - Specific error about shape availability @@ -263,10 +275,11 @@ export OCI_PROXY_URL="invalid-format" ```bash # Test standardized error codes source scripts/utils.sh -get_exit_code_for_error_type "CAPACITY" # Should return 2 -get_exit_code_for_error_type "AUTH" # Should return 3 -get_exit_code_for_error_type "NETWORK" # Should return 4 -get_exit_code_for_error_type "UNKNOWN" # Should return 1 +get_exit_code_for_error_type "USER_LIMIT_REACHED" # Should return 5 +get_exit_code_for_error_type "CAPACITY" # Should return 2 +get_exit_code_for_error_type "AUTH" # Should return 3 +get_exit_code_for_error_type "NETWORK" # Should return 4 +get_exit_code_for_error_type "UNKNOWN" # Should return 1 ``` ### 📊 Performance Monitoring diff --git a/scripts/launch-instance.sh b/scripts/launch-instance.sh index a345758..2abbd45 100755 --- a/scripts/launch-instance.sh +++ b/scripts/launch-instance.sh @@ -388,8 +388,7 @@ launch_instance() { log_performance_metric "USER_LIMIT_REACHED" "${OCI_SHAPE:-unknown}" "$((ad_index + 1))" "$max_attempts" "TERMINAL" record_ad_result "$current_ad" "user_limit_reached" "$error_type" - # Send appropriate notification (not a failure, but informational) - send_telegram_notification "info" "Free tier limit reached for ${OCI_SHAPE:-unknown}. Consider managing existing instances." + # No notification needed - user limits are expected operational conditions # Exit with the specific user limit error code return "$OCI_EXIT_USER_LIMIT_ERROR" @@ -664,7 +663,7 @@ handle_launch_error_with_ad() { ;; "DUPLICATE") log_info "Instance with this name already exists. Skipping creation." 
- send_telegram_notification "info" "OCI instance already exists: ${INSTANCE_DISPLAY_NAME}" + # No notification needed - an already-existing instance is an expected condition when using state management echo "DUPLICATE" return 0 ;; diff --git a/scripts/launch-parallel.sh b/scripts/launch-parallel.sh index 137b714..6efb4c0 100755 --- a/scripts/launch-parallel.sh +++ b/scripts/launch-parallel.sh @@ -532,13 +532,7 @@ main() { log_info "User limit(s) reached for $user_limit_failures shape(s) - no further attempts needed" log_info "Consider managing existing instances to free capacity for new deployments" - # Send informational notification if enabled - if [[ "${ENABLE_NOTIFICATIONS:-}" == "true" ]]; then - local limit_message="User limits reached" - [[ $STATUS_A1 -eq 5 ]] && limit_message="${limit_message} (A1.Flex: 4/4 OCPUs)" - [[ $STATUS_E2 -eq 5 ]] && limit_message="${limit_message} (E2.Micro: 2/2 instances)" - send_telegram_notification "info" "$limit_message - consider managing existing instances" - fi + # No notification needed - user limits are expected operational conditions return 0 # User limits are not failures - they're expected behavior elif [[ $capacity_failures -eq 2 ]]; then diff --git a/scripts/notify.sh b/scripts/notify.sh index 1fee3e6..f9ba892 100755 --- a/scripts/notify.sh +++ b/scripts/notify.sh @@ -157,20 +157,8 @@ notify_instance_created() { send_telegram_notification_with_retry "success" "$message" } -# Send capacity error notification (info level since it's expected) -notify_capacity_unavailable() { - local shape="${OCI_SHAPE:-unknown}" - local ad="${OCI_AD:-unknown}" - - local message="Oracle Cloud capacity currently unavailable. 
- -**Details:** -• Shape: $shape -• Availability Domain: $ad -• Action: Will retry on next scheduled run" - - send_telegram_notification "info" "$message" -} +# notify_capacity_unavailable() function removed - capacity issues are expected operational conditions +# and should not generate notifications per the notification policy # Send configuration error notification notify_configuration_error() {